From dfcb27540213e8061ecffacd4bd8ed54a310a7b0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 May 2021 02:09:28 +0200 Subject: rcu/nocb: Start moving nocb code to its own plugin file The kernel/rcu/tree_plugin.h file contains not only the plugins for preemptible RCU, but also many other features including rcu_nocbs callback offloading. This offloading has become large and complex, so it is time to put it in its own file. This commit starts that process. Suggested-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker [ paulmck: Rename to tree_nocb.h, add Frederic as author. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree_nocb.h | 1496 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 1487 --------------------------------------------- 3 files changed, 1497 insertions(+), 1487 deletions(-) create mode 100644 kernel/rcu/tree_nocb.h (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51f24ecd94b2..b6eb480402e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4784,4 +4784,5 @@ void __init rcu_init(void) #include "tree_stall.h" #include "tree_exp.h" +#include "tree_nocb.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h new file mode 100644 index 000000000000..8fdf44f8523f --- /dev/null +++ b/kernel/rcu/tree_nocb.h @@ -0,0 +1,1496 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptible semantics. + * + * Copyright Red Hat, 2009 + * Copyright IBM Corporation, 2009 + * Copyright SUSE, 2021 + * + * Author: Ingo Molnar + * Paul E. McKenney + * Frederic Weisbecker + */ + +#ifdef CONFIG_RCU_NOCB_CPU +static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ +static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return lockdep_is_held(&rdp->nocb_lock); +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + /* Race on early boot between thread creation and assignment */ + if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) + return true; + + if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) + if (in_task()) + return true; + return false; +} + +/* + * Offload callback processing from the boot-time-specified set of CPUs + * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads + * created that pull the callbacks from the corresponding CPU, wait for + * a grace period to elapse, and invoke the callbacks. These kthreads + * are organized into GP kthreads, which manage incoming callbacks, wait for + * grace periods, and awaken CB kthreads, and the CB kthreads, which only + * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs + * do a wake_up() on their GP kthread when they insert a callback into any + * empty list, unless the rcu_nocb_poll boot parameter has been specified, + * in which case each kthread actively polls its CPU. (Which isn't so great + * for energy efficiency, but which does reduce RCU's overhead on that CPU.) + * + * This is intended to be used in conjunction with Frederic Weisbecker's + * adaptive-idle work, which would seriously reduce OS jitter on CPUs + * running CPU-bound user-mode computations. 
+ * + * Offloading of callbacks can also be used as an energy-efficiency + * measure because CPUs with no RCU callbacks queued are more aggressive + * about entering dyntick-idle mode. + */ + + +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * If the list is invalid, a warning is emitted and all CPUs are offloaded. + */ +static int __init rcu_nocb_setup(char *str) +{ + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + if (cpulist_parse(str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } + return 1; +} +__setup("rcu_nocbs=", rcu_nocb_setup); + +static int __init parse_rcu_nocb_poll(char *arg) +{ + rcu_nocb_poll = true; + return 0; +} +early_param("rcu_nocb_poll", parse_rcu_nocb_poll); + +/* + * Don't bother bypassing ->cblist if the call_rcu() rate is low. + * After all, the main point of bypassing is to avoid lock contention + * on ->nocb_lock, which only can happen at high call_rcu() rates. + */ +static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +module_param(nocb_nobypass_lim_per_jiffy, int, 0); + +/* + * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the + * lock isn't immediately available, increment ->nocb_lock_contended to + * flag the contention. + */ +static void rcu_nocb_bypass_lock(struct rcu_data *rdp) + __acquires(&rdp->nocb_bypass_lock) +{ + lockdep_assert_irqs_disabled(); + if (raw_spin_trylock(&rdp->nocb_bypass_lock)) + return; + atomic_inc(&rdp->nocb_lock_contended); + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + smp_mb__after_atomic(); /* atomic_inc() before lock. */ + raw_spin_lock(&rdp->nocb_bypass_lock); + smp_mb__before_atomic(); /* atomic_dec() after lock. */ + atomic_dec(&rdp->nocb_lock_contended); +} + +/* + * Spinwait until the specified rcu_data structure's ->nocb_lock is + * not contended. Please note that this is extremely special-purpose, + * relying on the fact that at most two kthreads and one CPU contend for + * this lock, and also that the two kthreads are guaranteed to have frequent + * grace-period-duration time intervals between successive acquisitions + * of the lock. This allows us to use an extremely simple throttling + * mechanism, and further to apply it only to the CPU doing floods of + * call_rcu() invocations. Don't try this at home! + */ +static void rcu_nocb_wait_contended(struct rcu_data *rdp) +{ + WARN_ON_ONCE(smp_processor_id() != rdp->cpu); + while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) + cpu_relax(); +} + +/* + * Conditionally acquire the specified rcu_data structure's + * ->nocb_bypass_lock. + */ +static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + return raw_spin_trylock(&rdp->nocb_bypass_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_bypass_lock. + */ +static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) + __releases(&rdp->nocb_bypass_lock) +{ + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_bypass_lock); +} + +/* + * Acquire the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (!rcu_rdp_is_offloaded(rdp)) + return; + raw_spin_lock(&rdp->nocb_lock); +} + +/* + * Release the specified rcu_data structure's ->nocb_lock, but only + * if it corresponds to a no-CBs CPU. 
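
Usage note (illustrative, not part of the diff above): rcu_nocb_setup() and parse_rcu_nocb_poll() are the handlers for the rcu_nocbs= and rcu_nocb_poll boot parameters. A typical command line, with an example CPU list, might be

    rcu_nocbs=1-7 rcu_nocb_poll

which offloads callbacks from CPUs 1-7 and makes the rcuog kthreads poll rather than wait to be woken. nocb_nobypass_lim_per_jiffy is declared as a module parameter, so with RCU built in it is expected to be spelled rcutree.nocb_nobypass_lim_per_jiffy= on the command line.
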
+ */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ + if (rcu_rdp_is_offloaded(rdp)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock(&rdp->nocb_lock); + } +} + +/* + * Release the specified rcu_data structure's ->nocb_lock and restore + * interrupts, but only if it corresponds to a no-CBs CPU. + */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + if (rcu_rdp_is_offloaded(rdp)) { + lockdep_assert_irqs_disabled(); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + } else { + local_irq_restore(flags); + } +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); + if (rcu_rdp_is_offloaded(rdp)) + lockdep_assert_held(&rdp->nocb_lock); +} + +/* + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. + */ +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) +{ + swake_up_all(sq); +} + +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); +} + +/* Is the specified CPU a no-CBs CPU? */ +bool rcu_is_nocb_cpu(int cpu) +{ + if (cpumask_available(rcu_nocb_mask)) + return cpumask_test_cpu(cpu, rcu_nocb_mask); + return false; +} + +static bool __wake_nocb_gp(struct rcu_data *rdp_gp, + struct rcu_data *rdp, + bool force, unsigned long flags) + __releases(rdp_gp->nocb_gp_lock) +{ + bool needwake = false; + + if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("AlreadyAwake")); + return false; + } + + if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&rdp_gp->nocb_timer); + } + + if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { + WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); + needwake = true; + } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + if (needwake) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); + wake_up_process(rdp_gp->nocb_gp_kthread); + } + + return needwake; +} + +/* + * Kick the GP kthread for this NOCB group. + */ +static bool wake_nocb_gp(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + return __wake_nocb_gp(rdp_gp, rdp, force, flags); +} + +/* + * Arrange to wake the GP kthread for this NOCB group at some future + * time when it is safe to do so. + */ +static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + + /* + * Bypass wakeup overrides previous deferments. In case + * of callback storm, no need to wake up too early. 
+ */ + if (waketype == RCU_NOCB_WAKE_BYPASS) { + mod_timer(&rdp_gp->nocb_timer, jiffies + 2); + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } else { + if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) + mod_timer(&rdp_gp->nocb_timer, jiffies + 1); + if (rdp_gp->nocb_defer_wakeup < waketype) + WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); + } + + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + struct rcu_cblist rcl; + + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); + rcu_lockdep_assert_cblist_protected(rdp); + lockdep_assert_held(&rdp->nocb_bypass_lock); + if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { + raw_spin_unlock(&rdp->nocb_bypass_lock); + return false; + } + /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ + if (rhp) + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); + WRITE_ONCE(rdp->nocb_bypass_first, j); + rcu_nocb_bypass_unlock(rdp); + return true; +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + if (!rcu_rdp_is_offloaded(rdp)) + return true; + rcu_lockdep_assert_cblist_protected(rdp); + rcu_nocb_bypass_lock(rdp); + return rcu_nocb_do_flush_bypass(rdp, rhp, j); +} + +/* + * If the ->nocb_bypass_lock is immediately available, flush the + * ->nocb_bypass queue into ->cblist. + */ +static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) +{ + rcu_lockdep_assert_cblist_protected(rdp); + if (!rcu_rdp_is_offloaded(rdp) || + !rcu_nocb_bypass_trylock(rdp)) + return; + WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); +} + +/* + * See whether it is appropriate to use the ->nocb_bypass list in order + * to control contention on ->nocb_lock. A limited number of direct + * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass + * is non-empty, further callbacks must be placed into ->nocb_bypass, + * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch + * back to direct use of ->cblist. However, ->nocb_bypass should not be + * used if ->cblist is empty, because otherwise callbacks can be stranded + * on ->nocb_bypass because we cannot count on the current CPU ever again + * invoking call_rcu(). The general rule is that if ->nocb_bypass is + * non-empty, the corresponding no-CBs grace-period kthread must not be + * in an indefinite sleep state. + * + * Finally, it is not permitted to use the bypass during early boot, + * as doing so would confuse the auto-initialization code. 
Besides + * which, there is no point in worrying about lock contention while + * there is only one CPU in operation. + */ +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) +{ + unsigned long c; + unsigned long cur_gp_seq; + unsigned long j = jiffies; + long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + + lockdep_assert_irqs_disabled(); + + // Pure softirq/rcuc based processing: no bypassing, no + // locking. + if (!rcu_rdp_is_offloaded(rdp)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // In the process of (de-)offloading: no bypassing, but + // locking. + if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; /* Not offloaded, no bypassing. */ + } + + // Don't use ->nocb_bypass during early boot. + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + rcu_nocb_lock(rdp); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // If we have advanced to a new jiffy, reset counts to allow + // moving back from ->nocb_bypass to ->cblist. + if (j == rdp->nocb_nobypass_last) { + c = rdp->nocb_nobypass_count + 1; + } else { + WRITE_ONCE(rdp->nocb_nobypass_last, j); + c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; + if (ULONG_CMP_LT(rdp->nocb_nobypass_count, + nocb_nobypass_lim_per_jiffy)) + c = 0; + else if (c > nocb_nobypass_lim_per_jiffy) + c = nocb_nobypass_lim_per_jiffy; + } + WRITE_ONCE(rdp->nocb_nobypass_count, c); + + // If there hasn't yet been all that many ->cblist enqueues + // this jiffy, tell the caller to enqueue onto ->cblist. But flush + // ->nocb_bypass first. + if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + + // If ->nocb_bypass has been used too long or is too full, + // flush ->nocb_bypass to ->cblist. + if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || + ncbs >= qhimark) { + rcu_nocb_lock(rdp); + if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + return true; // Callback already enqueued. + } + + // We need to use the bypass. + rcu_nocb_wait_contended(rdp); + rcu_nocb_bypass_lock(rdp); + ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); + if (!ncbs) { + WRITE_ONCE(rdp->nocb_bypass_first, j); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); + } + rcu_nocb_bypass_unlock(rdp); + smp_mb(); /* Order enqueue before wake. 
*/ + if (ncbs) { + local_irq_restore(flags); + } else { + // No-CBs GP kthread might be indefinitely asleep, if so, wake. + rcu_nocb_lock(rdp); // Rare during call_rcu() flood. + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQwake")); + __call_rcu_nocb_wake(rdp, true, flags); + } else { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstBQnoWake")); + rcu_nocb_unlock_irqrestore(rdp, flags); + } + } + return true; // Callback already enqueued. +} + +/* + * Awaken the no-CBs grace-period kthread if needed, either due to it + * legitimately being asleep or due to overload conditions. + * + * If warranted, also wake up the kthread servicing this CPUs queues. + */ +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, + unsigned long flags) + __releases(rdp->nocb_lock) +{ + unsigned long cur_gp_seq; + unsigned long j; + long len; + struct task_struct *t; + + // If we are being polled or there is no kthread, just leave. + t = READ_ONCE(rdp->nocb_gp_kthread); + if (rcu_nocb_poll || !t) { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("WakeNotPoll")); + return; + } + // Need to actually to a wakeup. + len = rcu_segcblist_n_cbs(&rdp->cblist); + if (was_alldone) { + rdp->qlen_last_fqs_check = len; + if (!irqs_disabled_flags(flags)) { + /* ... if queue was empty ... */ + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp(rdp, false); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("WakeEmpty")); + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); + } + } else if (len > rdp->qlen_last_fqs_check + qhimark) { + /* ... or if many callbacks queued. */ + rdp->qlen_last_fqs_check = len; + j = jiffies; + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + smp_mb(); /* Enqueue before timer_pending(). */ + if ((rdp->nocb_cb_sleep || + !rcu_segcblist_ready_cbs(&rdp->cblist)) && + !timer_pending(&rdp->nocb_timer)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, + TPS("WakeOvfIsDeferred")); + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + } + } else { + rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + } + return; +} + +/* + * Check if we ignore this rdp. 
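
The heart of rcu_nocb_try_bypass() above is a per-jiffy rate limit on direct ->cblist enqueues: the count is bumped within a jiffy, decayed when a new jiffy starts, and the bypass is used only once the count reaches nocb_nobypass_lim_per_jiffy. Below is a stand-alone sketch of that arithmetic in plain user-space C; the names and the HZ=250 value are invented for the example and are not kernel APIs.

/* Illustrative sketch of the per-tick rate limit used by rcu_nocb_try_bypass().
 * Only the arithmetic mirrors the kernel code above; everything else is a toy. */
#include <stdbool.h>
#include <stdio.h>

#define LIM_PER_TICK (16 * 1000 / 250)	/* like nocb_nobypass_lim_per_jiffy for HZ=250 */

struct rl_state {
	unsigned long last_tick;	/* like ->nocb_nobypass_last */
	unsigned long count;		/* like ->nocb_nobypass_count */
};

/* Return true when the caller should use the bypass list, false when it may
 * still enqueue directly.  @now plays the role of jiffies. */
static bool should_bypass(struct rl_state *s, unsigned long now)
{
	if (now == s->last_tick) {
		s->count++;				/* same tick: one more direct enqueue */
	} else {
		s->last_tick = now;			/* new tick: decay the count ... */
		if (s->count < LIM_PER_TICK)
			s->count = 0;			/* ... to zero if the limit was never hit */
		else
			s->count -= LIM_PER_TICK;	/* ... else subtract one tick's quota */
		if (s->count > LIM_PER_TICK)
			s->count = LIM_PER_TICK;	/* and clamp, as the kernel does */
	}
	/* Direct enqueues are allowed until the per-tick limit is reached. */
	return s->count >= LIM_PER_TICK;
}

int main(void)
{
	struct rl_state s = { 0, 0 };
	int bypassed = 0;

	for (int i = 0; i < 200; i++)		/* 200 call_rcu()s within one tick */
		bypassed += should_bypass(&s, 100);
	printf("bypassed %d of 200 enqueues in one tick (limit %d)\n",
	       bypassed, LIM_PER_TICK);
	return 0;
}
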
+ * + * We check that without holding the nocb lock but + * we make sure not to miss a freshly offloaded rdp + * with the current ordering: + * + * rdp_offload_toggle() nocb_gp_enabled_cb() + * ------------------------- ---------------------------- + * WRITE flags LOCK nocb_gp_lock + * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep + * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock + * UNLOCK nocb_gp_lock READ flags + */ +static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; + + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, + bool *needwake_state) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + } + return false; + } + + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + return true; +} + + +/* + * No-CBs GP kthreads come here to wait for additional callbacks to show up + * or for grace periods to end. + */ +static void nocb_gp_wait(struct rcu_data *my_rdp) +{ + bool bypass = false; + long bypass_ncbs; + int __maybe_unused cpu = my_rdp->cpu; + unsigned long cur_gp_seq; + unsigned long flags; + bool gotcbs = false; + unsigned long j = jiffies; + bool needwait_gp = false; // This prevents actual uninitialized use. + bool needwake; + bool needwake_gp; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. + bool wasempty = false; + + /* + * Each pass through the following loop checks for CBs and for the + * nearest grace period (if any) to wait for next. The CB kthreads + * and the global grace-period kthread are awakened if needed. + */ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); + for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + bool needwake_state = false; + + if (!nocb_gp_enabled_cb(rdp)) + continue; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); + rcu_nocb_lock_irqsave(rdp, flags); + if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + continue; + } + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + if (bypass_ncbs && + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || + bypass_ncbs > 2 * qhimark)) { + // Bypass full or old, so flush it. + (void)rcu_nocb_try_flush_bypass(rdp, j); + bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + continue; /* No callbacks here, try next. */ + } + if (bypass_ncbs) { + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("Bypass")); + bypass = true; + } + rnp = rdp->mynode; + + // Advance callbacks if helpful and low contention. 
+ needwake_gp = false; + if (!rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL) || + (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake_gp = rcu_advance_cbs(rnp, rdp); + wasempty = rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL); + raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ + } + // Need to wait on some grace period? + WARN_ON_ONCE(wasempty && + !rcu_segcblist_restempty(&rdp->cblist, + RCU_NEXT_READY_TAIL)); + if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { + if (!needwait_gp || + ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) + wait_gp_seq = cur_gp_seq; + needwait_gp = true; + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("NeedWaitGP")); + } + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { + needwake = rdp->nocb_cb_sleep; + WRITE_ONCE(rdp->nocb_cb_sleep, false); + smp_mb(); /* CB invocation -after- GP end. */ + } else { + needwake = false; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake) { + swake_up_one(&rdp->nocb_cb_wq); + gotcbs = true; + } + if (needwake_gp) + rcu_gp_kthread_wake(); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + } + + my_rdp->nocb_gp_bypass = bypass; + my_rdp->nocb_gp_gp = needwait_gp; + my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; + + if (bypass && !rcu_nocb_poll) { + // At least one child with non-empty ->nocb_bypass, so set + // timer in order to avoid stranding its callbacks. + wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, + TPS("WakeBypassIsDeferred")); + } + if (rcu_nocb_poll) { + /* Polling, so trace if first poll in the series. */ + if (gotcbs) + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); + schedule_timeout_idle(1); + } else if (!needwait_gp) { + /* Wait for callbacks to appear. */ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); + } else { + rnp = my_rdp->mynode; + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); + swait_event_interruptible_exclusive( + rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], + rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); + } + if (!rcu_nocb_poll) { + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + } + WRITE_ONCE(my_rdp->nocb_gp_sleep, true); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } + my_rdp->nocb_gp_seq = -1; + WARN_ON(signal_pending(current)); +} + +/* + * No-CBs grace-period-wait kthread. There is one of these per group + * of CPUs, but only once at least one CPU in that group has come online + * at least once since boot. This kthread checks for newly posted + * callbacks from any of the CPUs it is responsible for, waits for a + * grace period, then awakens all of the rcu_nocb_cb_kthread() instances + * that then have callback-invocation work to do. 
+ */ +static int rcu_nocb_gp_kthread(void *arg) +{ + struct rcu_data *rdp = arg; + + for (;;) { + WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1); + nocb_gp_wait(rdp); + cond_resched_tasks_rcu_qs(); + } + return 0; +} + +static inline bool nocb_cb_can_run(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) +{ + return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); +} + +/* + * Invoke any ready callbacks from the corresponding no-CBs CPU, + * then, if there are no more, wait for more to appear. + */ +static void nocb_cb_wait(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long cur_gp_seq; + unsigned long flags; + bool needwake_state = false; + bool needwake_gp = false; + bool can_sleep = true; + struct rcu_node *rnp = rdp->mynode; + + local_irq_save(flags); + rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + /* + * Disable BH to provide the expected environment. Also, when + * transitioning to/from NOCB mode, a self-requeuing callback might + * be invoked from softirq. A short grace period could cause both + * instances of this callback would execute concurrently. + */ + local_bh_disable(); + rcu_do_batch(rdp); + local_bh_enable(); + lockdep_assert_irqs_enabled(); + rcu_nocb_lock_irqsave(rdp, flags); + if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) && + rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && + raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ + needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + } + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } + if (rcu_segcblist_ready_cbs(cblist)) + can_sleep = false; + } else { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We won't touch the callbacks and keep sleeping until we ever + * get re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } + + WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); + + if (rdp->nocb_cb_sleep) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); + + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_gp) + rcu_gp_kthread_wake(); + + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + + do { + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + + // VVV Ensure CB invocation follows _sleep test. + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + } while (!nocb_cb_can_run(rdp)); +} + +/* + * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke + * nocb_cb_wait() to do the dirty work. + */ +static int rcu_nocb_cb_kthread(void *arg) +{ + struct rcu_data *rdp = arg; + + // Each pass through this loop does one callback batch, and, + // if there are no more ready callbacks, waits for them. + for (;;) { + nocb_cb_wait(rdp); + cond_resched_tasks_rcu_qs(); + } + return 0; +} + +/* Is a deferred wakeup of rcu_nocb_kthread() required? 
*/ +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) +{ + return READ_ONCE(rdp->nocb_defer_wakeup) >= level; +} + +/* Do a deferred wakeup of rcu_nocb_kthread(). */ +static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, + struct rcu_data *rdp, int level, + unsigned long flags) + __releases(rdp_gp->nocb_gp_lock) +{ + int ndw; + int ret; + + if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + return false; + } + + ndw = rdp_gp->nocb_defer_wakeup; + ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); + + return ret; +} + +/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ +static void do_nocb_deferred_wakeup_timer(struct timer_list *t) +{ + unsigned long flags; + struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + + WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); + + raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); + smp_mb__after_spinlock(); /* Timer expire before wakeup. */ + do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags); +} + +/* + * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. + * This means we do an inexact common-case check. Note that if + * we miss, ->nocb_timer will eventually clean things up. + */ +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)) + return false; + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags); +} + +void rcu_nocb_flush_deferred_wakeup(void) +{ + do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); +} +EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); + +static int rdp_offload_toggle(struct rcu_data *rdp, + bool offload, unsigned long flags) + __releases(rdp->nocb_lock) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + bool wake_gp = false; + + rcu_segcblist_offload(cblist, offload); + + if (rdp->nocb_cb_sleep) + rdp->nocb_cb_sleep = false; + rcu_nocb_unlock_irqrestore(rdp, flags); + + /* + * Ignore former value of nocb_cb_sleep and force wake up as it could + * have been spuriously set to false already. + */ + swake_up_one(&rdp->nocb_cb_wq); + + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + if (rdp_gp->nocb_gp_sleep) { + rdp_gp->nocb_gp_sleep = false; + wake_gp = true; + } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); + + return 0; +} + +static long rcu_nocb_rdp_deoffload(void *arg) +{ + struct rcu_data *rdp = arg; + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + + pr_info("De-offloading %d\n", rdp->cpu); + + rcu_nocb_lock_irqsave(rdp, flags); + /* + * Flush once and for all now. This suffices because we are + * running on the target CPU holding ->nocb_lock (thus having + * interrupts disabled), and because rdp_offload_toggle() + * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. + * Thus future calls to rcu_segcblist_completely_offloaded() will + * return false, which means that future calls to rcu_nocb_try_bypass() + * will refuse to put anything into the bypass. 
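
For reference, the level argument used by rcu_nocb_need_deferred_wakeup() and do_nocb_deferred_wakeup_common() above refers to the RCU_NOCB_WAKE_* constants defined in kernel/rcu/tree.h (not shown in this patch); their ordering is expected to be

    RCU_NOCB_WAKE_NOT < RCU_NOCB_WAKE_BYPASS < RCU_NOCB_WAKE < RCU_NOCB_WAKE_FORCE

so the ->nocb_timer handler, which passes RCU_NOCB_WAKE_BYPASS, fires for any pending deferment, while the fastpath do_nocb_deferred_wakeup() only acts on RCU_NOCB_WAKE or stronger.
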
+ */ + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); + ret = rdp_offload_toggle(rdp, false, flags); + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | + SEGCBLIST_KTHREAD_GP)); + /* + * Lock one last time to acquire latest callback updates from kthreads + * so we can later handle callbacks locally without locking. + */ + rcu_nocb_lock_irqsave(rdp, flags); + /* + * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb + * lock is released but how about being paranoid for once? + */ + rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); + /* + * With SEGCBLIST_SOFTIRQ_ONLY, we can't use + * rcu_nocb_unlock_irqrestore() anymore. + */ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + /* Sanity check */ + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + + + return ret; +} + +int rcu_nocb_cpu_deoffload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (rcu_rdp_is_offloaded(rdp)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); + ret = -EINVAL; + } + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); + +static long rcu_nocb_rdp_offload(void *arg) +{ + struct rcu_data *rdp = arg; + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + /* + * For now we only support re-offload, ie: the rdp must have been + * offloaded on boot first. + */ + if (!rdp->nocb_gp_rdp) + return -EINVAL; + + pr_info("Offloading %d\n", rdp->cpu); + /* + * Can't use rcu_nocb_lock_irqsave() while we are in + * SEGCBLIST_SOFTIRQ_ONLY mode. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + + /* + * We didn't take the nocb lock while working on the + * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. + * Every modifications that have been done previously on + * rdp->cblist must be visible remotely by the nocb kthreads + * upon wake up after reading the cblist flags. 
+ * + * The layout against nocb_lock enforces that ordering: + * + * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() + * ------------------------- ---------------------------- + * WRITE callbacks rcu_nocb_lock() + * rcu_nocb_lock() READ flags + * WRITE flags READ callbacks + * rcu_nocb_unlock() rcu_nocb_unlock() + */ + ret = rdp_offload_toggle(rdp, true, flags); + swait_event_exclusive(rdp->nocb_state_wq, + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + + return ret; +} + +int rcu_nocb_cpu_offload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (!rcu_rdp_is_offloaded(rdp)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Can't CB-offload an offline CPU\n"); + ret = -EINVAL; + } + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); + +void __init rcu_init_nohz(void) +{ + int cpu; + bool need_rcu_nocb_mask = false; + struct rcu_data *rdp; + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) + need_rcu_nocb_mask = true; +#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + + if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { + if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { + pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); + return; + } + } + if (!cpumask_available(rcu_nocb_mask)) + return; + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running) + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } + if (cpumask_empty(rcu_nocb_mask)) + pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); + else + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); + if (rcu_nocb_poll) + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); + + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rcu_segcblist_empty(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_offload(&rdp->cblist, true); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); + } + rcu_organize_nocb_kthreads(); +} + +/* Initialize per-rcu_data variables for no-CBs CPUs. */ +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ + init_swait_queue_head(&rdp->nocb_cb_wq); + init_swait_queue_head(&rdp->nocb_gp_wq); + init_swait_queue_head(&rdp->nocb_state_wq); + raw_spin_lock_init(&rdp->nocb_lock); + raw_spin_lock_init(&rdp->nocb_bypass_lock); + raw_spin_lock_init(&rdp->nocb_gp_lock); + timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); + rcu_cblist_init(&rdp->nocb_bypass); +} + +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread + * for this CPU's group has not yet been created, spawn it as well. 
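
rcu_nocb_cpu_offload() and rcu_nocb_cpu_deoffload() above are the exported entry points for toggling offloading at run time. A minimal, hypothetical in-kernel caller could look like the sketch below; the declarations are expected to come from include/linux/rcupdate.h, and the error handling mirrors the return values in the code above (0 on success, -EINVAL for an offline CPU or when re-offloading a CPU that was never offloaded at boot).

/* Illustrative only: toggle callback offloading for one CPU and report errors. */
#include <linux/rcupdate.h>
#include <linux/printk.h>

static void example_toggle_nocb(int cpu)
{
	int ret;

	ret = rcu_nocb_cpu_deoffload(cpu);	/* back to softirq/rcuc processing */
	if (ret)
		pr_warn("de-offload of CPU %d failed: %d\n", cpu, ret);

	ret = rcu_nocb_cpu_offload(cpu);	/* hand callbacks back to rcuo kthreads */
	if (ret)
		pr_warn("re-offload of CPU %d failed: %d\n", cpu, ret);
}
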
+ */ +static void rcu_spawn_one_nocb_kthread(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp_gp; + struct task_struct *t; + + /* + * If this isn't a no-CBs CPU or if it already has an rcuo kthread, + * then nothing to do. + */ + if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) + return; + + /* If we didn't spawn the GP kthread first, reorganize! */ + rdp_gp = rdp->nocb_gp_rdp; + if (!rdp_gp->nocb_gp_kthread) { + t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, + "rcuog/%d", rdp_gp->cpu); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) + return; + WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); + } + + /* Spawn the kthread for this CPU. */ + t = kthread_run(rcu_nocb_cb_kthread, rdp, + "rcuo%c/%d", rcu_state.abbr, cpu); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) + return; + WRITE_ONCE(rdp->nocb_cb_kthread, t); + WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); +} + +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo kthread, spawn it. + */ +static void rcu_spawn_cpu_nocb_kthread(int cpu) +{ + if (rcu_scheduler_fully_active) + rcu_spawn_one_nocb_kthread(cpu); +} + +/* + * Once the scheduler is running, spawn rcuo kthreads for all online + * no-CBs CPUs. This assumes that the early_initcall()s happen before + * non-boot CPUs come online -- if this changes, we will need to add + * some mutual exclusion. + */ +static void __init rcu_spawn_nocb_kthreads(void) +{ + int cpu; + + for_each_online_cpu(cpu) + rcu_spawn_cpu_nocb_kthread(cpu); +} + +/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ +static int rcu_nocb_gp_stride = -1; +module_param(rcu_nocb_gp_stride, int, 0444); + +/* + * Initialize GP-CB relationships for all no-CBs CPU. + */ +static void __init rcu_organize_nocb_kthreads(void) +{ + int cpu; + bool firsttime = true; + bool gotnocbs = false; + bool gotnocbscbs = true; + int ls = rcu_nocb_gp_stride; + int nl = 0; /* Next GP kthread. */ + struct rcu_data *rdp; + struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ + struct rcu_data *rdp_prev = NULL; + + if (!cpumask_available(rcu_nocb_mask)) + return; + if (ls == -1) { + ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); + rcu_nocb_gp_stride = ls; + } + + /* + * Each pass through this loop sets up one rcu_data structure. + * Should the corresponding CPU come online in the future, then + * we will spawn the needed set of rcu_nocb_kthread() kthreads. + */ + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu >= nl) { + /* New GP kthread, set up for CBs & next GP. */ + gotnocbs = true; + nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; + rdp->nocb_gp_rdp = rdp; + rdp_gp = rdp; + if (dump_tree) { + if (!firsttime) + pr_cont("%s\n", gotnocbscbs + ? "" : " (self only)"); + gotnocbscbs = false; + firsttime = false; + pr_alert("%s: No-CB GP kthread CPU %d:", + __func__, cpu); + } + } else { + /* Another CB kthread, link to previous GP kthread. */ + gotnocbscbs = true; + rdp->nocb_gp_rdp = rdp_gp; + rdp_prev->nocb_next_cb_rdp = rdp; + if (dump_tree) + pr_cont(" %d", cpu); + } + rdp_prev = rdp; + } + if (gotnocbs && dump_tree) + pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); +} + +/* + * Bind the current task to the offloaded CPUs. If there are no offloaded + * CPUs, leave the task unbound. Splat if the bind attempt fails. 
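
rcu_organize_nocb_kthreads() above splits the no-CBs CPUs into groups of roughly sqrt(nr_cpu_ids) CPUs, each served by one rcuog/<cpu> grace-period kthread plus one rcuo<flavor>/<cpu> callback kthread per CPU (rcuop on preemptible kernels, rcuos otherwise, per rcu_state.abbr). Below is a stand-alone sketch of the same grouping arithmetic, with invented names and an example CPU count.

/* Stand-alone sketch of the rcuog group assignment done by
 * rcu_organize_nocb_kthreads(); the stride mirrors the kernel's
 * nr_cpu_ids / int_sqrt(nr_cpu_ids) default, everything else is invented. */
#include <math.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpus = 16;				/* example: rcu_nocbs=0-15 */
	int ls = nr_cpus / (int)sqrt(nr_cpus);		/* default rcu_nocb_gp_stride */
	int nl = 0;					/* next group boundary */

	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		if (cpu >= nl) {			/* this CPU leads a new group */
			nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
			printf("%srcuog group led by CPU %d:", cpu ? "\n" : "", cpu);
		}
		printf(" %d", cpu);
	}
	printf("\n");
	return 0;
}
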
+ */ +void rcu_bind_current_to_nocb(void) +{ + if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) + WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); +} +EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); + +// The ->on_cpu field is available only in CONFIG_SMP=y, so... +#ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; +} +#else // #ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return ""; +} +#endif // #else #ifdef CONFIG_SMP + +/* + * Dump out nocb grace-period kthread state for the specified rcu_data + * structure. + */ +static void show_rcu_nocb_gp_state(struct rcu_data *rdp) +{ + struct rcu_node *rnp = rdp->mynode; + + pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", + rdp->cpu, + "kK"[!!rdp->nocb_gp_kthread], + "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], + "dD"[!!rdp->nocb_defer_wakeup], + "tT"[timer_pending(&rdp->nocb_timer)], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[swait_active(&rdp->nocb_gp_wq)], + ".W"[swait_active(&rnp->nocb_gp_wq[0])], + ".W"[swait_active(&rnp->nocb_gp_wq[1])], + ".B"[!!rdp->nocb_gp_bypass], + ".G"[!!rdp->nocb_gp_gp], + (long)rdp->nocb_gp_seq, + rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), + rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); +} + +/* Dump out nocb kthread state for the specified rcu_data structure. */ +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ + char bufw[20]; + char bufr[20]; + struct rcu_segcblist *rsclp = &rdp->cblist; + bool waslocked; + bool wassleep; + + if (rdp->nocb_gp_rdp == rdp) + show_rcu_nocb_gp_state(rdp); + + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + rdp->cpu, rdp->nocb_gp_rdp->cpu, + rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, + "kK"[!!rdp->nocb_cb_kthread], + "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], + "cC"[!!atomic_read(&rdp->nocb_lock_contended)], + "lL"[raw_spin_is_locked(&rdp->nocb_lock)], + "sS"[!!rdp->nocb_cb_sleep], + ".W"[swait_active(&rdp->nocb_cb_wq)], + jiffies - rdp->nocb_bypass_first, + jiffies - rdp->nocb_nobypass_last, + rdp->nocb_nobypass_count, + ".D"[rcu_segcblist_ready_cbs(rsclp)], + ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, + ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, + ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + rcu_segcblist_n_cbs(&rdp->cblist), + rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); + + /* It is OK for GP kthreads to have GP state. */ + if (rdp->nocb_gp_rdp == rdp) + return; + + waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); + wassleep = swait_active(&rdp->nocb_gp_wq); + if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) + return; /* Nothing untoward. */ + + pr_info(" nocb GP activity on CB-only CPU!!! 
%c%c%c %c\n", + "lL"[waslocked], + "dD"[!!rdp->nocb_defer_wakeup], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[wassleep]); +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return 0; +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + return false; +} + +/* No ->nocb_lock to acquire. */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + local_irq_restore(flags); +} + +/* Lockdep check that ->cblist may be safely accessed. */ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); +} + +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) +{ +} + +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ +} + +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j) +{ + return true; +} + +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags) +{ + return false; +} + +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags) +{ + WARN_ON_ONCE(1); /* Should be dead code! */ +} + +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ +} + +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) +{ + return false; +} + +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + return false; +} + +static void rcu_spawn_cpu_nocb_kthread(int cpu) +{ +} + +static void __init rcu_spawn_nocb_kthreads(void) +{ +} + +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index de1dc3bb7f70..aec5075c60b3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -13,39 +13,6 @@ #include "../locking/rtmutex_common.h" -#ifdef CONFIG_RCU_NOCB_CPU -static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ -static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return lockdep_is_held(&rdp->nocb_lock); -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - /* Race on early boot between thread creation and assignment */ - if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) - return true; - - if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) - if (in_task()) - return true; - return false; -} - -#else -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return 0; -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - return false; -} - -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) { /* @@ -1479,1460 +1446,6 @@ static void rcu_cleanup_after_idle(void) #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_NOCB_CPU - -/* - * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask. 
For the CPUs in the set, there are kthreads - * created that pull the callbacks from the corresponding CPU, wait for - * a grace period to elapse, and invoke the callbacks. These kthreads - * are organized into GP kthreads, which manage incoming callbacks, wait for - * grace periods, and awaken CB kthreads, and the CB kthreads, which only - * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs - * do a wake_up() on their GP kthread when they insert a callback into any - * empty list, unless the rcu_nocb_poll boot parameter has been specified, - * in which case each kthread actively polls its CPU. (Which isn't so great - * for energy efficiency, but which does reduce RCU's overhead on that CPU.) - * - * This is intended to be used in conjunction with Frederic Weisbecker's - * adaptive-idle work, which would seriously reduce OS jitter on CPUs - * running CPU-bound user-mode computations. - * - * Offloading of callbacks can also be used as an energy-efficiency - * measure because CPUs with no RCU callbacks queued are more aggressive - * about entering dyntick-idle mode. - */ - - -/* - * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. - * If the list is invalid, a warning is emitted and all CPUs are offloaded. - */ -static int __init rcu_nocb_setup(char *str) -{ - alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (cpulist_parse(str, rcu_nocb_mask)) { - pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); - cpumask_setall(rcu_nocb_mask); - } - return 1; -} -__setup("rcu_nocbs=", rcu_nocb_setup); - -static int __init parse_rcu_nocb_poll(char *arg) -{ - rcu_nocb_poll = true; - return 0; -} -early_param("rcu_nocb_poll", parse_rcu_nocb_poll); - -/* - * Don't bother bypassing ->cblist if the call_rcu() rate is low. - * After all, the main point of bypassing is to avoid lock contention - * on ->nocb_lock, which only can happen at high call_rcu() rates. - */ -static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; -module_param(nocb_nobypass_lim_per_jiffy, int, 0); - -/* - * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the - * lock isn't immediately available, increment ->nocb_lock_contended to - * flag the contention. - */ -static void rcu_nocb_bypass_lock(struct rcu_data *rdp) - __acquires(&rdp->nocb_bypass_lock) -{ - lockdep_assert_irqs_disabled(); - if (raw_spin_trylock(&rdp->nocb_bypass_lock)) - return; - atomic_inc(&rdp->nocb_lock_contended); - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - smp_mb__after_atomic(); /* atomic_inc() before lock. */ - raw_spin_lock(&rdp->nocb_bypass_lock); - smp_mb__before_atomic(); /* atomic_dec() after lock. */ - atomic_dec(&rdp->nocb_lock_contended); -} - -/* - * Spinwait until the specified rcu_data structure's ->nocb_lock is - * not contended. Please note that this is extremely special-purpose, - * relying on the fact that at most two kthreads and one CPU contend for - * this lock, and also that the two kthreads are guaranteed to have frequent - * grace-period-duration time intervals between successive acquisitions - * of the lock. This allows us to use an extremely simple throttling - * mechanism, and further to apply it only to the CPU doing floods of - * call_rcu() invocations. Don't try this at home! - */ -static void rcu_nocb_wait_contended(struct rcu_data *rdp) -{ - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) - cpu_relax(); -} - -/* - * Conditionally acquire the specified rcu_data structure's - * ->nocb_bypass_lock. 
- */ -static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - return raw_spin_trylock(&rdp->nocb_bypass_lock); -} - -/* - * Release the specified rcu_data structure's ->nocb_bypass_lock. - */ -static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) - __releases(&rdp->nocb_bypass_lock) -{ - lockdep_assert_irqs_disabled(); - raw_spin_unlock(&rdp->nocb_bypass_lock); -} - -/* - * Acquire the specified rcu_data structure's ->nocb_lock, but only - * if it corresponds to a no-CBs CPU. - */ -static void rcu_nocb_lock(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (!rcu_rdp_is_offloaded(rdp)) - return; - raw_spin_lock(&rdp->nocb_lock); -} - -/* - * Release the specified rcu_data structure's ->nocb_lock, but only - * if it corresponds to a no-CBs CPU. - */ -static void rcu_nocb_unlock(struct rcu_data *rdp) -{ - if (rcu_rdp_is_offloaded(rdp)) { - lockdep_assert_irqs_disabled(); - raw_spin_unlock(&rdp->nocb_lock); - } -} - -/* - * Release the specified rcu_data structure's ->nocb_lock and restore - * interrupts, but only if it corresponds to a no-CBs CPU. - */ -static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, - unsigned long flags) -{ - if (rcu_rdp_is_offloaded(rdp)) { - lockdep_assert_irqs_disabled(); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - } else { - local_irq_restore(flags); - } -} - -/* Lockdep check that ->cblist may be safely accessed. */ -static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - lockdep_assert_held(&rdp->nocb_lock); -} - -/* - * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended - * grace period. - */ -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) -{ - swake_up_all(sq); -} - -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -{ - return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ - init_swait_queue_head(&rnp->nocb_gp_wq[0]); - init_swait_queue_head(&rnp->nocb_gp_wq[1]); -} - -/* Is the specified CPU a no-CBs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - -static bool __wake_nocb_gp(struct rcu_data *rdp_gp, - struct rcu_data *rdp, - bool force, unsigned long flags) - __releases(rdp_gp->nocb_gp_lock) -{ - bool needwake = false; - - if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("AlreadyAwake")); - return false; - } - - if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&rdp_gp->nocb_timer); - } - - if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { - WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); - needwake = true; - } - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - if (needwake) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - wake_up_process(rdp_gp->nocb_gp_kthread); - } - - return needwake; -} - -/* - * Kick the GP kthread for this NOCB group. 
- */ -static bool wake_nocb_gp(struct rcu_data *rdp, bool force) -{ - unsigned long flags; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - return __wake_nocb_gp(rdp_gp, rdp, force, flags); -} - -/* - * Arrange to wake the GP kthread for this NOCB group at some future - * time when it is safe to do so. - */ -static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, - const char *reason) -{ - unsigned long flags; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - - /* - * Bypass wakeup overrides previous deferments. In case - * of callback storm, no need to wake up too early. - */ - if (waketype == RCU_NOCB_WAKE_BYPASS) { - mod_timer(&rdp_gp->nocb_timer, jiffies + 2); - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); - } else { - if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) - mod_timer(&rdp_gp->nocb_timer, jiffies + 1); - if (rdp_gp->nocb_defer_wakeup < waketype) - WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); - } - - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); -} - -/* - * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. - * However, if there is a callback to be enqueued and if ->nocb_bypass - * proves to be initially empty, just return false because the no-CB GP - * kthread may need to be awakened in this case. - * - * Note that this function always returns true if rhp is NULL. - */ -static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) -{ - struct rcu_cblist rcl; - - WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); - rcu_lockdep_assert_cblist_protected(rdp); - lockdep_assert_held(&rdp->nocb_bypass_lock); - if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { - raw_spin_unlock(&rdp->nocb_bypass_lock); - return false; - } - /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ - if (rhp) - rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ - rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); - WRITE_ONCE(rdp->nocb_bypass_first, j); - rcu_nocb_bypass_unlock(rdp); - return true; -} - -/* - * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. - * However, if there is a callback to be enqueued and if ->nocb_bypass - * proves to be initially empty, just return false because the no-CB GP - * kthread may need to be awakened in this case. - * - * Note that this function always returns true if rhp is NULL. - */ -static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) -{ - if (!rcu_rdp_is_offloaded(rdp)) - return true; - rcu_lockdep_assert_cblist_protected(rdp); - rcu_nocb_bypass_lock(rdp); - return rcu_nocb_do_flush_bypass(rdp, rhp, j); -} - -/* - * If the ->nocb_bypass_lock is immediately available, flush the - * ->nocb_bypass queue into ->cblist. - */ -static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) -{ - rcu_lockdep_assert_cblist_protected(rdp); - if (!rcu_rdp_is_offloaded(rdp) || - !rcu_nocb_bypass_trylock(rdp)) - return; - WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); -} - -/* - * See whether it is appropriate to use the ->nocb_bypass list in order - * to control contention on ->nocb_lock. A limited number of direct - * enqueues are permitted into ->cblist per jiffy. 
If ->nocb_bypass - * is non-empty, further callbacks must be placed into ->nocb_bypass, - * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch - * back to direct use of ->cblist. However, ->nocb_bypass should not be - * used if ->cblist is empty, because otherwise callbacks can be stranded - * on ->nocb_bypass because we cannot count on the current CPU ever again - * invoking call_rcu(). The general rule is that if ->nocb_bypass is - * non-empty, the corresponding no-CBs grace-period kthread must not be - * in an indefinite sleep state. - * - * Finally, it is not permitted to use the bypass during early boot, - * as doing so would confuse the auto-initialization code. Besides - * which, there is no point in worrying about lock contention while - * there is only one CPU in operation. - */ -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags) -{ - unsigned long c; - unsigned long cur_gp_seq; - unsigned long j = jiffies; - long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - - lockdep_assert_irqs_disabled(); - - // Pure softirq/rcuc based processing: no bypassing, no - // locking. - if (!rcu_rdp_is_offloaded(rdp)) { - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; - } - - // In the process of (de-)offloading: no bypassing, but - // locking. - if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; /* Not offloaded, no bypassing. */ - } - - // Don't use ->nocb_bypass during early boot. - if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { - rcu_nocb_lock(rdp); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; - } - - // If we have advanced to a new jiffy, reset counts to allow - // moving back from ->nocb_bypass to ->cblist. - if (j == rdp->nocb_nobypass_last) { - c = rdp->nocb_nobypass_count + 1; - } else { - WRITE_ONCE(rdp->nocb_nobypass_last, j); - c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; - if (ULONG_CMP_LT(rdp->nocb_nobypass_count, - nocb_nobypass_lim_per_jiffy)) - c = 0; - else if (c > nocb_nobypass_lim_per_jiffy) - c = nocb_nobypass_lim_per_jiffy; - } - WRITE_ONCE(rdp->nocb_nobypass_count, c); - - // If there hasn't yet been all that many ->cblist enqueues - // this jiffy, tell the caller to enqueue onto ->cblist. But flush - // ->nocb_bypass first. - if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - if (*was_alldone) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstQ")); - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return false; // Caller must enqueue the callback. - } - - // If ->nocb_bypass has been used too long or is too full, - // flush ->nocb_bypass to ->cblist. - if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || - ncbs >= qhimark) { - rcu_nocb_lock(rdp); - if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - if (*was_alldone) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstQ")); - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return false; // Caller must enqueue the callback. 
- } - if (j != rdp->nocb_gp_adv_time && - rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { - rcu_advance_cbs_nowake(rdp->mynode, rdp); - rdp->nocb_gp_adv_time = j; - } - rcu_nocb_unlock_irqrestore(rdp, flags); - return true; // Callback already enqueued. - } - - // We need to use the bypass. - rcu_nocb_wait_contended(rdp); - rcu_nocb_bypass_lock(rdp); - ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ - rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); - if (!ncbs) { - WRITE_ONCE(rdp->nocb_bypass_first, j); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); - } - rcu_nocb_bypass_unlock(rdp); - smp_mb(); /* Order enqueue before wake. */ - if (ncbs) { - local_irq_restore(flags); - } else { - // No-CBs GP kthread might be indefinitely asleep, if so, wake. - rcu_nocb_lock(rdp); // Rare during call_rcu() flood. - if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstBQwake")); - __call_rcu_nocb_wake(rdp, true, flags); - } else { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("FirstBQnoWake")); - rcu_nocb_unlock_irqrestore(rdp, flags); - } - } - return true; // Callback already enqueued. -} - -/* - * Awaken the no-CBs grace-period kthread if needed, either due to it - * legitimately being asleep or due to overload conditions. - * - * If warranted, also wake up the kthread servicing this CPUs queues. - */ -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, - unsigned long flags) - __releases(rdp->nocb_lock) -{ - unsigned long cur_gp_seq; - unsigned long j; - long len; - struct task_struct *t; - - // If we are being polled or there is no kthread, just leave. - t = READ_ONCE(rdp->nocb_gp_kthread); - if (rcu_nocb_poll || !t) { - rcu_nocb_unlock_irqrestore(rdp, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeNotPoll")); - return; - } - // Need to actually to a wakeup. - len = rcu_segcblist_n_cbs(&rdp->cblist); - if (was_alldone) { - rdp->qlen_last_fqs_check = len; - if (!irqs_disabled_flags(flags)) { - /* ... if queue was empty ... */ - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp(rdp, false); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeEmpty")); - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, - TPS("WakeEmptyIsDeferred")); - } - } else if (len > rdp->qlen_last_fqs_check + qhimark) { - /* ... or if many callbacks queued. */ - rdp->qlen_last_fqs_check = len; - j = jiffies; - if (j != rdp->nocb_gp_adv_time && - rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { - rcu_advance_cbs_nowake(rdp->mynode, rdp); - rdp->nocb_gp_adv_time = j; - } - smp_mb(); /* Enqueue before timer_pending(). */ - if ((rdp->nocb_cb_sleep || - !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp->nocb_timer)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, - TPS("WakeOvfIsDeferred")); - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); - } - } else { - rcu_nocb_unlock_irqrestore(rdp, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); - } - return; -} - -/* - * Check if we ignore this rdp. 
- * - * We check that without holding the nocb lock but - * we make sure not to miss a freshly offloaded rdp - * with the current ordering: - * - * rdp_offload_toggle() nocb_gp_enabled_cb() - * ------------------------- ---------------------------- - * WRITE flags LOCK nocb_gp_lock - * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep - * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock - * UNLOCK nocb_gp_lock READ flags - */ -static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, - bool *needwake_state) -{ - struct rcu_segcblist *cblist = &rdp->cblist; - - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - } - return false; - } - - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We will ignore this rdp until it ever gets re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - return true; -} - - -/* - * No-CBs GP kthreads come here to wait for additional callbacks to show up - * or for grace periods to end. - */ -static void nocb_gp_wait(struct rcu_data *my_rdp) -{ - bool bypass = false; - long bypass_ncbs; - int __maybe_unused cpu = my_rdp->cpu; - unsigned long cur_gp_seq; - unsigned long flags; - bool gotcbs = false; - unsigned long j = jiffies; - bool needwait_gp = false; // This prevents actual uninitialized use. - bool needwake; - bool needwake_gp; - struct rcu_data *rdp; - struct rcu_node *rnp; - unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. - bool wasempty = false; - - /* - * Each pass through the following loop checks for CBs and for the - * nearest grace period (if any) to wait for next. The CB kthreads - * and the global grace-period kthread are awakened if needed. - */ - WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { - bool needwake_state = false; - - if (!nocb_gp_enabled_cb(rdp)) - continue; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); - rcu_nocb_lock_irqsave(rdp, flags); - if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - continue; - } - bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - if (bypass_ncbs && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || - bypass_ncbs > 2 * qhimark)) { - // Bypass full or old, so flush it. - (void)rcu_nocb_try_flush_bypass(rdp, j); - bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - continue; /* No callbacks here, try next. */ - } - if (bypass_ncbs) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("Bypass")); - bypass = true; - } - rnp = rdp->mynode; - - // Advance callbacks if helpful and low contention. 
- needwake_gp = false; - if (!rcu_segcblist_restempty(&rdp->cblist, - RCU_NEXT_READY_TAIL) || - (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && - rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { - raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ - needwake_gp = rcu_advance_cbs(rnp, rdp); - wasempty = rcu_segcblist_restempty(&rdp->cblist, - RCU_NEXT_READY_TAIL); - raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ - } - // Need to wait on some grace period? - WARN_ON_ONCE(wasempty && - !rcu_segcblist_restempty(&rdp->cblist, - RCU_NEXT_READY_TAIL)); - if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { - if (!needwait_gp || - ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) - wait_gp_seq = cur_gp_seq; - needwait_gp = true; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("NeedWaitGP")); - } - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - needwake = rdp->nocb_cb_sleep; - WRITE_ONCE(rdp->nocb_cb_sleep, false); - smp_mb(); /* CB invocation -after- GP end. */ - } else { - needwake = false; - } - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake) { - swake_up_one(&rdp->nocb_cb_wq); - gotcbs = true; - } - if (needwake_gp) - rcu_gp_kthread_wake(); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - } - - my_rdp->nocb_gp_bypass = bypass; - my_rdp->nocb_gp_gp = needwait_gp; - my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; - - if (bypass && !rcu_nocb_poll) { - // At least one child with non-empty ->nocb_bypass, so set - // timer in order to avoid stranding its callbacks. - wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, - TPS("WakeBypassIsDeferred")); - } - if (rcu_nocb_poll) { - /* Polling, so trace if first poll in the series. */ - if (gotcbs) - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_idle(1); - } else if (!needwait_gp) { - /* Wait for callbacks to appear. */ - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, - !READ_ONCE(my_rdp->nocb_gp_sleep)); - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); - } else { - rnp = my_rdp->mynode; - trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); - swait_event_interruptible_exclusive( - rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], - rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || - !READ_ONCE(my_rdp->nocb_gp_sleep)); - trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); - } - if (!rcu_nocb_poll) { - raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); - if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - } - WRITE_ONCE(my_rdp->nocb_gp_sleep, true); - raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); - } - my_rdp->nocb_gp_seq = -1; - WARN_ON(signal_pending(current)); -} - -/* - * No-CBs grace-period-wait kthread. There is one of these per group - * of CPUs, but only once at least one CPU in that group has come online - * at least once since boot. This kthread checks for newly posted - * callbacks from any of the CPUs it is responsible for, waits for a - * grace period, then awakens all of the rcu_nocb_cb_kthread() instances - * that then have callback-invocation work to do. 
- */ -static int rcu_nocb_gp_kthread(void *arg) -{ - struct rcu_data *rdp = arg; - - for (;;) { - WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1); - nocb_gp_wait(rdp); - cond_resched_tasks_rcu_qs(); - } - return 0; -} - -static inline bool nocb_cb_can_run(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) -{ - return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); -} - -/* - * Invoke any ready callbacks from the corresponding no-CBs CPU, - * then, if there are no more, wait for more to appear. - */ -static void nocb_cb_wait(struct rcu_data *rdp) -{ - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long cur_gp_seq; - unsigned long flags; - bool needwake_state = false; - bool needwake_gp = false; - bool can_sleep = true; - struct rcu_node *rnp = rdp->mynode; - - local_irq_save(flags); - rcu_momentary_dyntick_idle(); - local_irq_restore(flags); - /* - * Disable BH to provide the expected environment. Also, when - * transitioning to/from NOCB mode, a self-requeuing callback might - * be invoked from softirq. A short grace period could cause both - * instances of this callback would execute concurrently. - */ - local_bh_disable(); - rcu_do_batch(rdp); - local_bh_enable(); - lockdep_assert_irqs_enabled(); - rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) && - rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && - raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ - needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - } - - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - if (rcu_segcblist_ready_cbs(cblist)) - can_sleep = false; - } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We won't touch the callbacks and keep sleeping until we ever - * get re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - - WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); - - if (rdp->nocb_cb_sleep) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); - - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_gp) - rcu_gp_kthread_wake(); - - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - // VVV Ensure CB invocation follows _sleep test. - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); -} - -/* - * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke - * nocb_cb_wait() to do the dirty work. - */ -static int rcu_nocb_cb_kthread(void *arg) -{ - struct rcu_data *rdp = arg; - - // Each pass through this loop does one callback batch, and, - // if there are no more ready callbacks, waits for them. - for (;;) { - nocb_cb_wait(rdp); - cond_resched_tasks_rcu_qs(); - } - return 0; -} - -/* Is a deferred wakeup of rcu_nocb_kthread() required? 
*/ -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) -{ - return READ_ONCE(rdp->nocb_defer_wakeup) >= level; -} - -/* Do a deferred wakeup of rcu_nocb_kthread(). */ -static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, - struct rcu_data *rdp, int level, - unsigned long flags) - __releases(rdp_gp->nocb_gp_lock) -{ - int ndw; - int ret; - - if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - return false; - } - - ndw = rdp_gp->nocb_defer_wakeup; - ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); - - return ret; -} - -/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ -static void do_nocb_deferred_wakeup_timer(struct timer_list *t) -{ - unsigned long flags; - struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); - - WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); - - raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); - smp_mb__after_spinlock(); /* Timer expire before wakeup. */ - do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags); -} - -/* - * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. - * This means we do an inexact common-case check. Note that if - * we miss, ->nocb_timer will eventually clean things up. - */ -static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - - if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)) - return false; - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags); -} - -void rcu_nocb_flush_deferred_wakeup(void) -{ - do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); -} -EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); - -static int rdp_offload_toggle(struct rcu_data *rdp, - bool offload, unsigned long flags) - __releases(rdp->nocb_lock) -{ - struct rcu_segcblist *cblist = &rdp->cblist; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - bool wake_gp = false; - - rcu_segcblist_offload(cblist, offload); - - if (rdp->nocb_cb_sleep) - rdp->nocb_cb_sleep = false; - rcu_nocb_unlock_irqrestore(rdp, flags); - - /* - * Ignore former value of nocb_cb_sleep and force wake up as it could - * have been spuriously set to false already. - */ - swake_up_one(&rdp->nocb_cb_wq); - - raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); - if (rdp_gp->nocb_gp_sleep) { - rdp_gp->nocb_gp_sleep = false; - wake_gp = true; - } - raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - - if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); - - return 0; -} - -static long rcu_nocb_rdp_deoffload(void *arg) -{ - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long flags; - int ret; - - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - - pr_info("De-offloading %d\n", rdp->cpu); - - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Flush once and for all now. This suffices because we are - * running on the target CPU holding ->nocb_lock (thus having - * interrupts disabled), and because rdp_offload_toggle() - * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. - * Thus future calls to rcu_segcblist_completely_offloaded() will - * return false, which means that future calls to rcu_nocb_try_bypass() - * will refuse to put anything into the bypass. 
- */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); - ret = rdp_offload_toggle(rdp, false, flags); - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | - SEGCBLIST_KTHREAD_GP)); - /* - * Lock one last time to acquire latest callback updates from kthreads - * so we can later handle callbacks locally without locking. - */ - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb - * lock is released but how about being paranoid for once? - */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); - /* - * With SEGCBLIST_SOFTIRQ_ONLY, we can't use - * rcu_nocb_unlock_irqrestore() anymore. - */ - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - - /* Sanity check */ - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - - - return ret; -} - -int rcu_nocb_cpu_deoffload(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int ret = 0; - - mutex_lock(&rcu_state.barrier_mutex); - cpus_read_lock(); - if (rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); - if (!ret) - cpumask_clear_cpu(cpu, rcu_nocb_mask); - } else { - pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); - ret = -EINVAL; - } - } - cpus_read_unlock(); - mutex_unlock(&rcu_state.barrier_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); - -static long rcu_nocb_rdp_offload(void *arg) -{ - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; - unsigned long flags; - int ret; - - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - /* - * For now we only support re-offload, ie: the rdp must have been - * offloaded on boot first. - */ - if (!rdp->nocb_gp_rdp) - return -EINVAL; - - pr_info("Offloading %d\n", rdp->cpu); - /* - * Can't use rcu_nocb_lock_irqsave() while we are in - * SEGCBLIST_SOFTIRQ_ONLY mode. - */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - - /* - * We didn't take the nocb lock while working on the - * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. - * Every modifications that have been done previously on - * rdp->cblist must be visible remotely by the nocb kthreads - * upon wake up after reading the cblist flags. 
- * - * The layout against nocb_lock enforces that ordering: - * - * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() - * ------------------------- ---------------------------- - * WRITE callbacks rcu_nocb_lock() - * rcu_nocb_lock() READ flags - * WRITE flags READ callbacks - * rcu_nocb_unlock() rcu_nocb_unlock() - */ - ret = rdp_offload_toggle(rdp, true, flags); - swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - - return ret; -} - -int rcu_nocb_cpu_offload(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int ret = 0; - - mutex_lock(&rcu_state.barrier_mutex); - cpus_read_lock(); - if (!rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); - if (!ret) - cpumask_set_cpu(cpu, rcu_nocb_mask); - } else { - pr_info("NOCB: Can't CB-offload an offline CPU\n"); - ret = -EINVAL; - } - } - cpus_read_unlock(); - mutex_unlock(&rcu_state.barrier_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); - -void __init rcu_init_nohz(void) -{ - int cpu; - bool need_rcu_nocb_mask = false; - struct rcu_data *rdp; - -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) - need_rcu_nocb_mask = true; -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ - - if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { - if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { - pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); - return; - } - } - if (!cpumask_available(rcu_nocb_mask)) - return; - -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running) - cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ - - if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); - cpumask_and(rcu_nocb_mask, cpu_possible_mask, - rcu_nocb_mask); - } - if (cpumask_empty(rcu_nocb_mask)) - pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); - else - pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", - cpumask_pr_args(rcu_nocb_mask)); - if (rcu_nocb_poll) - pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - - for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_segcblist_empty(&rdp->cblist)) - rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - } - rcu_organize_nocb_kthreads(); -} - -/* Initialize per-rcu_data variables for no-CBs CPUs. */ -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ - init_swait_queue_head(&rdp->nocb_cb_wq); - init_swait_queue_head(&rdp->nocb_gp_wq); - init_swait_queue_head(&rdp->nocb_state_wq); - raw_spin_lock_init(&rdp->nocb_lock); - raw_spin_lock_init(&rdp->nocb_bypass_lock); - raw_spin_lock_init(&rdp->nocb_gp_lock); - timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); - rcu_cblist_init(&rdp->nocb_bypass); -} - -/* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread - * for this CPU's group has not yet been created, spawn it as well. 
- */ -static void rcu_spawn_one_nocb_kthread(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_data *rdp_gp; - struct task_struct *t; - - /* - * If this isn't a no-CBs CPU or if it already has an rcuo kthread, - * then nothing to do. - */ - if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) - return; - - /* If we didn't spawn the GP kthread first, reorganize! */ - rdp_gp = rdp->nocb_gp_rdp; - if (!rdp_gp->nocb_gp_kthread) { - t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, - "rcuog/%d", rdp_gp->cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) - return; - WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); - } - - /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_cb_kthread, rdp, - "rcuo%c/%d", rcu_state.abbr, cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - return; - WRITE_ONCE(rdp->nocb_cb_kthread, t); - WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); -} - -/* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it. - */ -static void rcu_spawn_cpu_nocb_kthread(int cpu) -{ - if (rcu_scheduler_fully_active) - rcu_spawn_one_nocb_kthread(cpu); -} - -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); -} - -/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ -static int rcu_nocb_gp_stride = -1; -module_param(rcu_nocb_gp_stride, int, 0444); - -/* - * Initialize GP-CB relationships for all no-CBs CPU. - */ -static void __init rcu_organize_nocb_kthreads(void) -{ - int cpu; - bool firsttime = true; - bool gotnocbs = false; - bool gotnocbscbs = true; - int ls = rcu_nocb_gp_stride; - int nl = 0; /* Next GP kthread. */ - struct rcu_data *rdp; - struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ - struct rcu_data *rdp_prev = NULL; - - if (!cpumask_available(rcu_nocb_mask)) - return; - if (ls == -1) { - ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); - rcu_nocb_gp_stride = ls; - } - - /* - * Each pass through this loop sets up one rcu_data structure. - * Should the corresponding CPU come online in the future, then - * we will spawn the needed set of rcu_nocb_kthread() kthreads. - */ - for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->cpu >= nl) { - /* New GP kthread, set up for CBs & next GP. */ - gotnocbs = true; - nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; - rdp->nocb_gp_rdp = rdp; - rdp_gp = rdp; - if (dump_tree) { - if (!firsttime) - pr_cont("%s\n", gotnocbscbs - ? "" : " (self only)"); - gotnocbscbs = false; - firsttime = false; - pr_alert("%s: No-CB GP kthread CPU %d:", - __func__, cpu); - } - } else { - /* Another CB kthread, link to previous GP kthread. */ - gotnocbscbs = true; - rdp->nocb_gp_rdp = rdp_gp; - rdp_prev->nocb_next_cb_rdp = rdp; - if (dump_tree) - pr_cont(" %d", cpu); - } - rdp_prev = rdp; - } - if (gotnocbs && dump_tree) - pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); -} - -/* - * Bind the current task to the offloaded CPUs. If there are no offloaded - * CPUs, leave the task unbound. Splat if the bind attempt fails. 
- */ -void rcu_bind_current_to_nocb(void) -{ - if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) - WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); -} -EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); - -// The ->on_cpu field is available only in CONFIG_SMP=y, so... -#ifdef CONFIG_SMP -static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) -{ - return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; -} -#else // #ifdef CONFIG_SMP -static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) -{ - return ""; -} -#endif // #else #ifdef CONFIG_SMP - -/* - * Dump out nocb grace-period kthread state for the specified rcu_data - * structure. - */ -static void show_rcu_nocb_gp_state(struct rcu_data *rdp) -{ - struct rcu_node *rnp = rdp->mynode; - - pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", - rdp->cpu, - "kK"[!!rdp->nocb_gp_kthread], - "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], - "dD"[!!rdp->nocb_defer_wakeup], - "tT"[timer_pending(&rdp->nocb_timer)], - "sS"[!!rdp->nocb_gp_sleep], - ".W"[swait_active(&rdp->nocb_gp_wq)], - ".W"[swait_active(&rnp->nocb_gp_wq[0])], - ".W"[swait_active(&rnp->nocb_gp_wq[1])], - ".B"[!!rdp->nocb_gp_bypass], - ".G"[!!rdp->nocb_gp_gp], - (long)rdp->nocb_gp_seq, - rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), - rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); -} - -/* Dump out nocb kthread state for the specified rcu_data structure. */ -static void show_rcu_nocb_state(struct rcu_data *rdp) -{ - char bufw[20]; - char bufr[20]; - struct rcu_segcblist *rsclp = &rdp->cblist; - bool waslocked; - bool wassleep; - - if (rdp->nocb_gp_rdp == rdp) - show_rcu_nocb_gp_state(rdp); - - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", - rdp->cpu, rdp->nocb_gp_rdp->cpu, - rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, - "kK"[!!rdp->nocb_cb_kthread], - "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], - "cC"[!!atomic_read(&rdp->nocb_lock_contended)], - "lL"[raw_spin_is_locked(&rdp->nocb_lock)], - "sS"[!!rdp->nocb_cb_sleep], - ".W"[swait_active(&rdp->nocb_cb_wq)], - jiffies - rdp->nocb_bypass_first, - jiffies - rdp->nocb_nobypass_last, - rdp->nocb_nobypass_count, - ".D"[rcu_segcblist_ready_cbs(rsclp)], - ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], - rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, - ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], - rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, - ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], - ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], - rcu_segcblist_n_cbs(&rdp->cblist), - rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); - - /* It is OK for GP kthreads to have GP state. */ - if (rdp->nocb_gp_rdp == rdp) - return; - - waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) - return; /* Nothing untoward. */ - - pr_info(" nocb GP activity on CB-only CPU!!! 
%c%c%c %c\n", - "lL"[waslocked], - "dD"[!!rdp->nocb_defer_wakeup], - "sS"[!!rdp->nocb_gp_sleep], - ".W"[wassleep]); -} - -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - -/* No ->nocb_lock to acquire. */ -static void rcu_nocb_lock(struct rcu_data *rdp) -{ -} - -/* No ->nocb_lock to release. */ -static void rcu_nocb_unlock(struct rcu_data *rdp) -{ -} - -/* No ->nocb_lock to release. */ -static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, - unsigned long flags) -{ - local_irq_restore(flags); -} - -/* Lockdep check that ->cblist may be safely accessed. */ -static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); -} - -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) -{ -} - -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -{ - return NULL; -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ -} - -static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - unsigned long j) -{ - return true; -} - -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags) -{ - return false; -} - -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, - unsigned long flags) -{ - WARN_ON_ONCE(1); /* Should be dead code! */ -} - -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ -} - -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) -{ - return false; -} - -static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) -{ - return false; -} - -static void rcu_spawn_cpu_nocb_kthread(int cpu) -{ -} - -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - -static void show_rcu_nocb_state(struct rcu_data *rdp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? -- cgit From cba712beebf32b27fea71241aa3cdd2ab0fc31a3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 May 2021 02:09:29 +0200 Subject: rcu/nocb: Remove NOCB deferred wakeup from rcutree_dead_cpu() At CPU offline time, we must handle any pending wakeup for the nocb_gp kthread linked to the outgoing CPU. Now we are making sure of that twice: 1) From rcu_report_dead() when the outgoing CPU makes the very last local cleanups by itself before switching offline. 2) From rcutree_dead_cpu(). Here the offlining CPU has gone and is truly now offline. Another CPU takes care of post-portem cleaning up and check if the offline CPU had pending wakeup. Both ways are fine but we have to choose one or the other because we don't need to repeat that action. Simply benefit from cache locality and keep only the first solution. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b6eb480402e0..0fda98a0d12e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2471,9 +2471,6 @@ int rcutree_dead_cpu(unsigned int cpu) WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Do any needed no-CB deferred wakeups from this CPU. */ - do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); - // Stop-machine done, so allow nohz_full to disable tick. 
tick_dep_clear(TICK_DEP_BIT_RCU); return 0; -- cgit From 45f4b4a202c03de14e315aaae3d305820cd12221 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 May 2021 11:26:53 -0700 Subject: rcu-tasks: Add comments explaining task_struct strategy Accesses to task_struct structures must be either protected by RCU or by get_task_struct(). Tasks trace RCU uses these in a non-obvious combination, in conjunction with an IPI handler. This commit therefore adds comments explaining this usage. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8536c55df514..6a117375a62a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -785,7 +785,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // set that task's .need_qs flag so that task's next outermost // rcu_read_unlock_trace() will report the quiescent state (in which // case the count of readers is incremented). If both attempts fail, -// the task is added to a "holdout" list. +// the task is added to a "holdout" list. Note that IPIs are used +// to invoke trc_read_check_handler() in the context of running tasks +// in order to avoid ordering overhead on common-case shared-variable +// accessses. // rcu_tasks_trace_postscan(): // Initialize state and attempt to identify an immediate quiescent // state as above (but only for idle tasks), unblock CPU-hotplug @@ -994,6 +997,12 @@ static void trc_wait_for_one_reader(struct task_struct *t, } put_task_struct(t); + // If this task is not yet on the holdout list, then we are in + // an RCU read-side critical section. Otherwise, the invocation of + // rcu_add_holdout() that added it to the list did the necessary + // get_task_struct(). Either way, the task cannot be freed out + // from under this code. + // If currently running, send an IPI, either way, add to list. trc_add_holdout(t, bhp); if (task_curr(t) && -- cgit From bdb0cca0d11060fce8a8a44588ac1470c25d62bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 May 2021 12:48:18 -0700 Subject: rcu-tasks: Mark ->trc_reader_nesting data races There are several ->trc_reader_nesting data races that are too low-probability for KCSAN to notice, but which will happen sooner or later. This commit therefore marks these accesses, and comments one that cannot race. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6a117375a62a..f6b5320c4484 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -897,7 +897,7 @@ static void trc_read_check_handler(void *t_in) // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. - if (likely(!t->trc_reader_nesting)) { + if (likely(!READ_ONCE(t->trc_reader_nesting))) { if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) wake_up(&trc_wait); // Mark as checked after decrement to avoid false @@ -906,7 +906,7 @@ static void trc_read_check_handler(void *t_in) goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. 
- if (unlikely(t->trc_reader_nesting < 0)) { + if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) { if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) wake_up(&trc_wait); goto reset_ipi; @@ -953,6 +953,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) n_heavy_reader_ofl_updates++; in_qs = true; } else { + // The task is not running, so C-language access is safe. in_qs = likely(!t->trc_reader_nesting); } @@ -985,7 +986,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, // The current task had better be in a quiescent state. if (t == current) { t->trc_reader_checked = true; - WARN_ON_ONCE(t->trc_reader_nesting); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); return; } @@ -1101,7 +1102,7 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0], ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], - t->trc_reader_nesting, + READ_ONCE(t->trc_reader_nesting), " N"[!!t->trc_reader_special.b.need_qs], cpu); sched_show_task(t); @@ -1196,7 +1197,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) static void exit_tasks_rcu_finish_trace(struct task_struct *t) { WRITE_ONCE(t->trc_reader_checked, true); - WARN_ON_ONCE(t->trc_reader_nesting); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); WRITE_ONCE(t->trc_reader_nesting, 0); if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) rcu_read_unlock_trace_special(t, 0); -- cgit From f8ab3fad80dddf3f2cecb53983063c4431058ca1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 May 2021 15:36:37 -0700 Subject: rcu-tasks: Mark ->trc_reader_special.b.need_qs data races There are several ->trc_reader_special.b.need_qs data races that are too low-probability for KCSAN to notice, but which will happen sooner or later. This commit therefore marks these accesses. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index f6b5320c4484..1cece5e9be9a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -850,7 +850,7 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) { - int nq = t->trc_reader_special.b.need_qs; + int nq = READ_ONCE(t->trc_reader_special.b.need_qs); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) @@ -916,7 +916,7 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); reset_ipi: @@ -968,7 +968,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) // state so that it will awaken the grace-period kthread upon exit // from that critical section. atomic_inc(&trc_n_readers_need_end); // One more to wait on. 
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs); + WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); return true; } @@ -1103,7 +1103,7 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".i"[is_idle_task(t)], ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)], READ_ONCE(t->trc_reader_nesting), - " N"[!!t->trc_reader_special.b.need_qs], + " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)], cpu); sched_show_task(t); } -- cgit From e4be1f44b6f8d36a8607a598d41c766044b74be3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Jun 2021 11:57:15 -0700 Subject: rcu-tasks: Fix synchronize_rcu_rude() typo in comment This commit replaces the fictitious synchronize_rcu_rude() function with its real-world synchronize_rcu_tasks_rude() counterpart. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1cece5e9be9a..f9f52daacd1c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -643,8 +643,8 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } // // "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of // passing an empty function to schedule_on_each_cpu(). This approach -// provides an asynchronous call_rcu_tasks_rude() API and batching -// of concurrent calls to the synchronous synchronize_rcu_rude() API. +// provides an asynchronous call_rcu_tasks_rude() API and batching of +// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API. // This invokes schedule_on_each_cpu() in order to send IPIs far and wide // and induces otherwise unnecessary context switches on all online CPUs, // whether idle or not. -- cgit From fed31a4dd3adb5455df7c704de2abb639a1dc1c0 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Tue, 13 Jul 2021 08:56:45 +0800 Subject: rcu: Fix macro name CONFIG_TASKS_RCU_TRACE This commit fixes several typos where CONFIG_TASKS_RCU_TRACE should instead be CONFIG_TASKS_TRACE_RCU. Among other things, these typos could cause CONFIG_TASKS_TRACE_RCU_READ_MB=y kernels to suffer from memory-ordering bugs that could result in false-positive quiescent states and too-short grace periods. Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index de1dc3bb7f70..6ce104242b23 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2982,17 +2982,17 @@ static void noinstr rcu_dynticks_task_exit(void) /* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ static void rcu_dynticks_task_trace_enter(void) { -#ifdef CONFIG_TASKS_RCU_TRACE +#ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) current->trc_reader_special.b.need_mb = true; -#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } /* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ static void rcu_dynticks_task_trace_exit(void) { -#ifdef CONFIG_TASKS_RCU_TRACE +#ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) current->trc_reader_special.b.need_mb = false; -#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -- cgit From 25f6fa53a07422e2bb004229eefd32760c469fb0 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 3 May 2021 17:04:57 -0700 Subject: refscale: Add measurement of clock readout This commit adds a "clock" type to refscale, which checks the performance of ktime_get_real_fast_ns(). Use the "clocksource=" kernel boot parameter to select the underlying clock source. [ paulmck: Work around compiler false positive per kernel test robot. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d998a76fb542..66dc14cf5687 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -467,6 +467,40 @@ static struct ref_scale_ops acqrel_ops = { .name = "acqrel" }; +static volatile u64 stopopts; + +static void ref_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += ktime_get_real_fast_ns(); + preempt_enable(); + stopopts = x; +} + +static void ref_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += ktime_get_real_fast_ns(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static struct ref_scale_ops clock_ops = { + .readsection = ref_clock_section, + .delaysection = ref_clock_delay_section, + .name = "clock" +}; + static void rcu_scale_one_reader(void) { if (readdelay <= 0) @@ -759,7 +793,7 @@ ref_scale_init(void) int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, }; if (!torture_init_begin(scale_type, verbose)) -- cgit From 59e836662860a28880d45b35e1fbc5afca4847ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 16 May 2021 21:17:27 -0700 Subject: rcutorture: Preempt rather than block when testing task stalls Currently, rcu_torture_stall() does a one-jiffy timed wait when stall_cpu_block is set. This works, but emits a pointless splat in CONFIG_PREEMPT=y kernels. This commit avoids this splat by instead invoking preempt_schedule() in CONFIG_PREEMPT=y kernels. This uses an admittedly ugly #ifdef, but abstracted approaches just looked worse. A prettier approach would provide a preempt_schedule() definition with a WARN_ON() for CONFIG_PREEMPT=n kernels, but this seems quite silly. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 40ef5417d954..ab4215266ebe 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2022,8 +2022,13 @@ static int rcu_torture_stall(void *args) __func__, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - if (stall_cpu_block) + if (stall_cpu_block) { +#ifdef CONFIG_PREEMPTION + preempt_schedule(); +#else schedule_timeout_uninterruptible(HZ); +#endif + } if (stall_cpu_irqsoff) local_irq_enable(); else if (!stall_cpu_block) -- cgit From 811192c5f24bfd7246ce9ce06f668d8c408bf39b Mon Sep 17 00:00:00 2001 From: "Jiangong.Han" Date: Tue, 22 Jun 2021 18:37:08 +0800 Subject: rcuscale: Console output claims too few grace periods The rcuscale console output claims N grace periods, numbered from zero to N, which means that there were really N+1 grace periods. 
The root cause of this bug is that rcu_scale_writer() stores the number of the last grace period (numbered from zero) into writer_n_durations[me] instead of the number of grace periods. This commit therefore assigns the actual number of grace periods to writer_n_durations[me], and also makes the corresponding adjustment to the loop outputting per-grace-period measurements. Sample of old console output: rcu-scale: writer 0 gps: 133 ...... rcu-scale: 0 writer-duration: 0 44003961 rcu-scale: 0 writer-duration: 1 32003582 ...... rcu-scale: 0 writer-duration: 132 28004391 rcu-scale: 0 writer-duration: 133 27996410 Sample of new console output: rcu-scale: writer 0 gps: 134 ...... rcu-scale: 0 writer-duration: 0 44003961 rcu-scale: 0 writer-duration: 1 32003582 ...... rcu-scale: 0 writer-duration: 132 28004391 rcu-scale: 0 writer-duration: 133 27996410 Signed-off-by: Jiangong.Han Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuscale.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index dca51fe9c73f..2cc34a22a506 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -487,7 +487,7 @@ retry: if (gp_async) { cur_ops->gp_barrier(); } - writer_n_durations[me] = i_max; + writer_n_durations[me] = i_max + 1; torture_kthread_stopping("rcu_scale_writer"); return 0; } @@ -561,7 +561,7 @@ rcu_scale_cleanup(void) wdpp = writer_durations[i]; if (!wdpp) continue; - for (j = 0; j <= writer_n_durations[i]; j++) { + for (j = 0; j < writer_n_durations[i]; j++) { wdp = &wdpp[j]; pr_alert("%s%s %4d writer-duration: %5d %llu\n", scale_type, SCALE_FLAG, -- cgit From 5b237d650eb8b0870b5d816fecc0be00237cbfff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Jun 2021 15:51:48 -0700 Subject: locktorture: Mark statistics data races The lock_stress_stats structure's ->n_lock_fail and ->n_lock_acquired fields are incremented and sampled locklessly using plain C-language statements, which KCSAN objects to. This commit therefore marks the statistics gathering with data_race() to flag the intent. While in the area, this commit also reduces the number of accesses to the ->n_lock_acquired field, thus eliminating some possible check/use confusion. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index b3adb40549bf..313d5e613fbe 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -738,20 +738,22 @@ static int lock_torture_reader(void *arg) static void __torture_print_stats(char *page, struct lock_stress_stats *statp, bool write) { + long cur; bool fail = false; int i, n_stress; - long max = 0, min = statp ? statp[0].n_lock_acquired : 0; + long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0; long long sum = 0; n_stress = write ? 
cxt.nrealwriters_stress : cxt.nrealreaders_stress; for (i = 0; i < n_stress; i++) { - if (statp[i].n_lock_fail) + if (data_race(statp[i].n_lock_fail)) fail = true; - sum += statp[i].n_lock_acquired; - if (max < statp[i].n_lock_acquired) - max = statp[i].n_lock_acquired; - if (min > statp[i].n_lock_acquired) - min = statp[i].n_lock_acquired; + cur = data_race(statp[i].n_lock_acquired); + sum += cur; + if (max < cur) + max = cur; + if (min > cur) + min = cur; } page += sprintf(page, "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", -- cgit From af5f6e27d52cdb2cb3826df19a69a74e9d5eff5e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Jun 2021 16:04:11 -0700 Subject: locktorture: Count lock readers Currently, the lock_is_read_held variable is bool, so that a reader sets it to true just after lock acquisition and then to false just before lock release. This works in a rough statistical sense, but can result in false negatives just after one of a pair of concurrent readers has released the lock. This approach does have low overhead, but at the expense of the setting to true potentially never leaving the reader's store buffer, thus resulting in an unconditional false negative. This commit therefore converts this variable to atomic_t and makes the reader use atomic_inc() just after acquisition and atomic_dec() just before release. This does increase overhead, but this increase is negligible compared to the 10-microsecond lock hold time. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 313d5e613fbe..7c5a4a087cc7 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -59,7 +59,7 @@ static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; static bool lock_is_write_held; -static bool lock_is_read_held; +static atomic_t lock_is_read_held; static unsigned long last_lock_release; struct lock_stress_stats { @@ -682,7 +682,7 @@ static int lock_torture_writer(void *arg) if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = true; - if (WARN_ON_ONCE(lock_is_read_held)) + if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) lwsp->n_lock_fail++; /* rare, but... */ lwsp->n_lock_acquired++; @@ -717,13 +717,13 @@ static int lock_torture_reader(void *arg) schedule_timeout_uninterruptible(1); cxt.cur_ops->readlock(tid); - lock_is_read_held = true; + atomic_inc(&lock_is_read_held); if (WARN_ON_ONCE(lock_is_write_held)) lrsp->n_lock_fail++; /* rare, but... */ lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); - lock_is_read_held = false; + atomic_dec(&lock_is_read_held); cxt.cur_ops->readunlock(tid); stutter_wait("lock_torture_reader"); @@ -998,7 +998,6 @@ static int __init lock_torture_init(void) } if (nreaders_stress) { - lock_is_read_held = false; cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, sizeof(*cxt.lrsa), GFP_KERNEL); -- cgit From 9b9a80677fd80bd531cb05bfe205a40a51955939 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Jun 2021 17:53:43 -0700 Subject: scftorture: Add RPC-like IPI tests This commit adds the weight_single_rpc module parameter, which causes the IPI handler to awaken the IPI sender. In many scheduler configurations, this will result in an IPI back to the sender that is likely to be received at a time when the sender CPU is idle. The intent is to stress IPI reception during CPU busy-to-idle transitions.
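The RPC-like test boils down to a simple pattern: the sender posts a no-wait IPI whose handler signals a completion, and the sender then sleeps in wait_for_completion(), so the wakeup is likely to reach a CPU that has meanwhile gone idle. The fragment below is a minimal, hypothetical sketch of that pattern (the rpc_ipi_demo names are invented for illustration only); the real implementation is the scf_check/scfc_rpc handling in the scftorture.c diff that follows.

#include <linux/completion.h>
#include <linux/smp.h>

struct rpc_ipi_demo {
	struct completion done;
};

/* Runs in IPI context on the target CPU: wake up the sender. */
static void rpc_ipi_demo_handler(void *info)
{
	struct rpc_ipi_demo *d = info;

	complete(&d->done);
}

static int rpc_ipi_demo_call(int cpu)
{
	struct rpc_ipi_demo d;
	int ret;

	init_completion(&d.done);
	/* wait == 0: do not spin; the completion serves as the "RPC" reply. */
	ret = smp_call_function_single(cpu, rpc_ipi_demo_handler, &d, 0);
	if (!ret)
		/* The sending CPU may be idle by the time the wakeup (often itself an IPI) arrives. */
		wait_for_completion(&d.done);
	return ret;
}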
Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 76 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 29e8fc5d91a7..5cf40e438319 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -64,6 +64,7 @@ torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations."); torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); +torture_param(int, weight_single_rpc, -1, "Testing weight for single-CPU RPC operations."); torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations."); @@ -86,6 +87,8 @@ struct scf_statistics { long long n_resched; long long n_single; long long n_single_ofl; + long long n_single_rpc; + long long n_single_rpc_ofl; long long n_single_wait; long long n_single_wait_ofl; long long n_many; @@ -101,14 +104,17 @@ static DEFINE_PER_CPU(long long, scf_invoked_count); // Data for random primitive selection #define SCF_PRIM_RESCHED 0 #define SCF_PRIM_SINGLE 1 -#define SCF_PRIM_MANY 2 -#define SCF_PRIM_ALL 3 -#define SCF_NPRIMS 7 // Need wait and no-wait versions of each, - // except for SCF_PRIM_RESCHED. +#define SCF_PRIM_SINGLE_RPC 2 +#define SCF_PRIM_MANY 3 +#define SCF_PRIM_ALL 4 +#define SCF_NPRIMS 8 // Need wait and no-wait versions of each, + // except for SCF_PRIM_RESCHED and + // SCF_PRIM_SINGLE_RPC. static char *scf_prim_name[] = { "resched_cpu", "smp_call_function_single", + "smp_call_function_single_rpc", "smp_call_function_many", "smp_call_function", }; @@ -128,6 +134,8 @@ struct scf_check { bool scfc_out; int scfc_cpu; // -1 for not _single(). bool scfc_wait; + bool scfc_rpc; + struct completion scfc_completion; }; // Use to wait for all threads to start. @@ -158,6 +166,7 @@ static void scf_torture_stats_print(void) scfs.n_resched += scf_stats_p[i].n_resched; scfs.n_single += scf_stats_p[i].n_single; scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; + scfs.n_single_rpc += scf_stats_p[i].n_single_rpc; scfs.n_single_wait += scf_stats_p[i].n_single_wait; scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl; scfs.n_many += scf_stats_p[i].n_many; @@ -168,9 +177,10 @@ static void scf_torture_stats_print(void) if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) bangstr = "!!! "; - pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ", SCFTORT_FLAG, bangstr, isdone ? 
"VER" : "ver", invoked_count, scfs.n_resched, scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, + scfs.n_single_rpc, scfs.n_single_rpc_ofl, scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs), @@ -282,10 +292,13 @@ static void scf_handler(void *scfc_in) out: if (unlikely(!scfcp)) return; - if (scfcp->scfc_wait) + if (scfcp->scfc_wait) { WRITE_ONCE(scfcp->scfc_out, true); - else + if (scfcp->scfc_rpc) + complete(&scfcp->scfc_completion); + } else { kfree(scfcp); + } } // As above, but check for correct CPU. @@ -319,6 +332,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_cpu = -1; scfcp->scfc_wait = scfsp->scfs_wait; scfcp->scfc_out = false; + scfcp->scfc_rpc = false; } } switch (scfsp->scfs_prim) { @@ -350,6 +364,34 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp = NULL; } break; + case SCF_PRIM_SINGLE_RPC: + if (!scfcp) + break; + cpu = torture_random(trsp) % nr_cpu_ids; + scfp->n_single_rpc++; + scfcp->scfc_cpu = cpu; + scfcp->scfc_wait = true; + init_completion(&scfcp->scfc_completion); + scfcp->scfc_rpc = true; + barrier(); // Prevent race-reduction compiler optimizations. + scfcp->scfc_in = true; + ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, 0); + if (!ret) { + if (use_cpus_read_lock) + cpus_read_unlock(); + else + preempt_enable(); + wait_for_completion(&scfcp->scfc_completion); + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); + } else { + scfp->n_single_rpc_ofl++; + kfree(scfcp); + scfcp = NULL; + } + break; case SCF_PRIM_MANY: if (scfsp->scfs_wait) scfp->n_many_wait++; @@ -379,10 +421,12 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } if (scfcp && scfsp->scfs_wait) { if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) && - !scfcp->scfc_out)) + !scfcp->scfc_out)) { + pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim); atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else + } else { kfree(scfcp); + } barrier(); // Prevent race-reduction compiler optimizations. 
} if (use_cpus_read_lock) @@ -453,8 +497,8 @@ static void scftorture_print_module_parms(const char *tag) { pr_alert(SCFTORT_FLAG - "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, - verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_rpc=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_rpc, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); } static void scf_cleanup_handler(void *unused) @@ -497,6 +541,7 @@ static int __init scf_torture_init(void) int firsterr = 0; unsigned long weight_resched1 = weight_resched; unsigned long weight_single1 = weight_single; + unsigned long weight_single_rpc1 = weight_single_rpc; unsigned long weight_single_wait1 = weight_single_wait; unsigned long weight_many1 = weight_many; unsigned long weight_many_wait1 = weight_many_wait; @@ -508,11 +553,13 @@ static int __init scf_torture_init(void) scftorture_print_module_parms("Start of test"); - if (weight_resched == -1 && weight_single == -1 && weight_single_wait == -1 && + if (weight_resched == -1 && + weight_single == -1 && weight_single_rpc == -1 && weight_single_wait == -1 && weight_many == -1 && weight_many_wait == -1 && weight_all == -1 && weight_all_wait == -1) { weight_resched1 = 2 * nr_cpu_ids; weight_single1 = 2 * nr_cpu_ids; + weight_single_rpc1 = 2 * nr_cpu_ids; weight_single_wait1 = 2 * nr_cpu_ids; weight_many1 = 2; weight_many_wait1 = 2; @@ -523,6 +570,8 @@ static int __init scf_torture_init(void) weight_resched1 = 0; if (weight_single == -1) weight_single1 = 0; + if (weight_single_rpc == -1) + weight_single_rpc1 = 0; if (weight_single_wait == -1) weight_single_wait1 = 0; if (weight_many == -1) @@ -534,7 +583,7 @@ static int __init scf_torture_init(void) if (weight_all_wait == -1) weight_all_wait1 = 0; } - if (weight_single1 == 0 && weight_single_wait1 == 0 && + if (weight_single1 == 0 && weight_single_rpc1 == 0 && weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 && weight_all1 == 0 && weight_all_wait1 == 0) { VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); @@ -546,6 +595,7 @@ static int __init scf_torture_init(void) else if (weight_resched1) VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored"); scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); + scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true); scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); scf_sel_add(weight_many1, SCF_PRIM_MANY, false); scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true); -- cgit From 586e4d4193a653eef21f02b50dee89e2e4be208c Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 9 Jul 2021 17:53:27 -0700 Subject: scftorture: Avoid NULL pointer exception on early exit When scftorture finds an error in the module parameters controlling the relative frequencies of smp_call_function*() variants, it takes an early exit. So early that it has not allocated memory to track the kthreads running the test, which results in a segfault. This commit therefore checks for the existence of the memory before attempting to stop the kthreads that would otherwise have been recorded in that non-existent memory. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 5cf40e438319..64a08288b1a6 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -513,7 +513,7 @@ static void scf_torture_cleanup(void) return; WRITE_ONCE(scfdone, true); - if (nthreads) + if (nthreads && scf_stats_p) for (i = 0; i < nthreads; i++) torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); else -- cgit From e6a901a44f76878ed1653626c9ff4cfc5a3f58f8 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Sun, 16 May 2021 00:45:11 +0800 Subject: rcu: Fix to include first blocked task in stall warning The for loop in rcu_print_task_stall() always omits ts[0], which points to the first task blocking the stalled grace period. This in turn fails to count this first task, which means that ndetected will be equal to zero when all CPUs have passed through their quiescent states and only one task is blocking the stalled grace period. This zero value for ndetected will in turn result in an incorrect "All QSes seen" message: rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: rcu: Tasks blocked on level-1 rcu_node (CPUs 12-23): (detected by 15, t=6504 jiffies, g=164777, q=9011209) rcu: All QSes seen, last rcu_preempt kthread activity 1 (4295252379-4295252378), jiffies_till_next_fqs=1, root ->qsmask 0x2 BUG: sleeping function called from invalid context at include/linux/uaccess.h:156 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 70613, name: msgstress04 INFO: lockdep is turned off. Preemption disabled at: [] create_object.isra.0+0x204/0x4b0 CPU: 15 PID: 70613 Comm: msgstress04 Kdump: loaded Not tainted 5.12.2-yoctodev-standard #1 Hardware name: Marvell OcteonTX CN96XX board (DT) Call trace: dump_backtrace+0x0/0x2cc show_stack+0x24/0x30 dump_stack+0x110/0x188 ___might_sleep+0x214/0x2d0 __might_sleep+0x7c/0xe0 This commit therefore fixes the loop to include ts[0]. Fixes: c583bcb8f5ed ("rcu: Don't invoke try_invoke_on_locked_down_task() with irqs disabled") Tested-by: Qais Yousef Signed-off-by: Yanfei Xu Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6c76988cc019..2e96f9741666 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -280,8 +280,8 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) break; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - for (i--; i; i--) { - t = ts[i]; + while (i) { + t = ts[--i]; if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) pr_cont(" P%d", t->pid); else -- cgit From dc87740c8a6806bd2162bfb441770e4e53be5601 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Sun, 16 May 2021 17:50:10 +0800 Subject: rcu: Fix stall-warning deadlock due to non-release of rcu_node ->lock If rcu_print_task_stall() is invoked on an rcu_node structure that does not contain any tasks blocking the current grace period, it takes an early exit that fails to release that rcu_node structure's lock. This results in a self-deadlock, which is detected by lockdep. To reproduce this bug: tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 3 --trust-make --configs "TREE03" --kconfig "CONFIG_PROVE_LOCKING=y" --bootargs "rcutorture.stall_cpu=30 rcutorture.stall_cpu_block=1 rcutorture.fwd_progress=0 rcutorture.test_boost=0" This will also result in other complaints, including RCU's scheduler hook complaining about blocking rather than preemption and an rcutorture writer stall. Only a partial RCU CPU stall warning message will be printed because of the self-deadlock. This commit therefore releases the lock on the rcu_print_task_stall() function's early exit path. Fixes: c583bcb8f5ed ("rcu: Don't invoke try_invoke_on_locked_down_task() with irqs disabled") Tested-by: Qais Yousef Signed-off-by: Yanfei Xu Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 2e96f9741666..bd4de5bc5807 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -267,8 +267,10 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) struct task_struct *ts[8]; lockdep_assert_irqs_disabled(); - if (!rcu_preempt_blocked_readers_cgp(rnp)) + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; + } pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", rnp->level, rnp->grplo, rnp->grphi); t = list_entry(rnp->gp_tasks->prev, -- cgit From a86baa69c2b7b85bab41692fa3ec188a5aae1d27 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 18 May 2021 19:17:16 -0700 Subject: rcu: Remove special bit at the bottom of the ->dynticks counter Commit b8c17e6664c4 ("rcu: Maintain special bits at bottom of ->dynticks counter") reserved a bit at the bottom of the ->dynticks counter to defer flushing of TLBs, but this facility never has been used. This commit therefore removes this capability along with the rcu_eqs_special_set() function used to trigger it. Link: https://lore.kernel.org/linux-doc/CALCETrWNPOOdTrFabTDd=H7+wc6xJ9rJceg6OL1S0rTV5pfSsA@mail.gmail.com/ Suggested-by: Andy Lutomirski Signed-off-by: "Joel Fernandes (Google)" [ paulmck: Forward-port to v5.13-rc1. ] Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 77 ++++++++++--------------------------------------------- 1 file changed, 14 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51f24ecd94b2..42a0032dd99f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -74,17 +74,10 @@ /* Data structures. */ -/* - * Steal a bit from the bottom of ->dynticks for idle entry/exit - * control. Initially this is for TLB flushing. - */ -#define RCU_DYNTICK_CTRL_MASK 0x1 -#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) - static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), + .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_RCU_NOCB_CPU .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, #endif @@ -266,7 +259,6 @@ void rcu_softirq_qs(void) */ static noinstr void rcu_dynticks_eqs_enter(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -275,13 +267,9 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_inc_return(&this_cpu_ptr(&rcu_data)->dynticks); // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_CTR)); - /* Better not have special action (TLB flush) pending! */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_MASK)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); } /* @@ -291,7 +279,6 @@ static noinstr void rcu_dynticks_eqs_enter(void) */ static noinstr void rcu_dynticks_eqs_exit(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); int seq; /* @@ -299,15 +286,10 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_inc_return(&this_cpu_ptr(&rcu_data)->dynticks); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(seq & RCU_DYNTICK_CTRL_CTR)); - if (seq & RCU_DYNTICK_CTRL_MASK) { - arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); - smp_mb__after_atomic(); /* _exit after clearing mask. 
*/ - } + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); } /* @@ -324,9 +306,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR) + if (atomic_read(&rdp->dynticks) & 0x1) return; - atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + atomic_inc(&rdp->dynticks); } /* @@ -336,9 +318,7 @@ static void rcu_dynticks_eqs_online(void) */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(arch_atomic_read(&this_cpu_ptr(&rcu_data)->dynticks) & 0x1); } /* @@ -347,9 +327,7 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) */ static int rcu_dynticks_snap(struct rcu_data *rdp) { - int snap = atomic_add_return(0, &rdp->dynticks); - - return snap & ~RCU_DYNTICK_CTRL_MASK; + return atomic_add_return(0, &rdp->dynticks); } /* @@ -358,7 +336,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & RCU_DYNTICK_CTRL_CTR); + return !(snap & 0x1); } /* Return true if the specified CPU is currently idle from an RCU viewpoint. */ @@ -389,8 +367,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | - RCU_DYNTICK_CTRL_CTR); + snap = atomic_read(&rdp->dynticks) & ~0x1; smp_rmb(); // Order ->dynticks and *vp reads. if (READ_ONCE(*vp)) @@ -398,32 +375,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) smp_rmb(); // Order *vp read and ->dynticks re-read. // If still in the same extended quiescent state, we are good! - return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); -} - -/* - * Set the special (bottom) bit of the specified CPU so that it - * will take special action (such as flushing its TLB) on the - * next exit from an extended quiescent state. Returns true if - * the bit was successfully set, or false if the CPU was not in - * an extended quiescent state. - */ -bool rcu_eqs_special_set(int cpu) -{ - int old; - int new; - int new_old; - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - - new_old = atomic_read(&rdp->dynticks); - do { - old = new_old; - if (old & RCU_DYNTICK_CTRL_CTR) - return false; - new = old | RCU_DYNTICK_CTRL_MASK; - new_old = atomic_cmpxchg(&rdp->dynticks, old, new); - } while (new_old != old); - return true; + return snap == atomic_read(&rdp->dynticks); } /* @@ -442,10 +394,9 @@ notrace void rcu_momentary_dyntick_idle(void) int special; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &this_cpu_ptr(&rcu_data)->dynticks); + special = atomic_add_return(2, &this_cpu_ptr(&rcu_data)->dynticks); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); + WARN_ON_ONCE(!(special & 0x1)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); -- cgit From 2be57f732889277b07ccddd205ef0616c8c1941f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 May 2021 17:25:42 -0700 Subject: rcu: Weaken ->dynticks accesses and updates Accesses to the rcu_data structure's ->dynticks field have always been fully ordered because it was not possible to prove that weaker ordering was safe. 
However, with the removal of the rcu_eqs_special_set() function and the advent of the Linux-kernel memory model, it is now easy to show that two of the four original full memory barriers can be weakened to acquire and release operations. The remaining pair must remain full memory barriers. This change makes the memory ordering requirements more evident, and it might well also speed up the to-idle and from-idle fastpaths on some architectures. The following litmus test, adapted from one supplied off-list by Frederic Weisbecker, models the RCU grace-period kthread detecting an idle CPU that is concurrently transitioning to non-idle: C dynticks-from-idle { DYNTICKS=0; (* Initially idle. *) } P0(int *X, int *DYNTICKS) { int dynticks; int x; // Idle. dynticks = READ_ONCE(*DYNTICKS); smp_store_release(DYNTICKS, dynticks + 1); smp_mb(); // Now non-idle x = READ_ONCE(*X); } P1(int *X, int *DYNTICKS) { int dynticks; WRITE_ONCE(*X, 1); smp_mb(); dynticks = smp_load_acquire(DYNTICKS); } exists (1:dynticks=0 /\ 0:x=1) Running "herd7 -conf linux-kernel.cfg dynticks-from-idle.litmus" verifies this transition, namely, showing that if the RCU grace-period kthread (P1) sees another CPU as idle (P0), then any memory access prior to the start of the grace period (P1's write to X) will be seen by any RCU read-side critical section following the to-non-idle transition (P0's read from X). This is a straightforward use of full memory barriers to force ordering in a store-buffering (SB) litmus test. The following litmus test, also adapted from the one supplied off-list by Frederic Weisbecker, models the RCU grace-period kthread detecting a non-idle CPU that is concurrently transitioning to idle: C dynticks-into-idle { DYNTICKS=1; (* Initially non-idle. *) } P0(int *X, int *DYNTICKS) { int dynticks; // Non-idle. WRITE_ONCE(*X, 1); dynticks = READ_ONCE(*DYNTICKS); smp_store_release(DYNTICKS, dynticks + 1); smp_mb(); // Now idle. } P1(int *X, int *DYNTICKS) { int x; int dynticks; smp_mb(); dynticks = smp_load_acquire(DYNTICKS); x = READ_ONCE(*X); } exists (1:dynticks=2 /\ 1:x=0) Running "herd7 -conf linux-kernel.cfg dynticks-into-idle.litmus" verifies this transition, namely, showing that if the RCU grace-period kthread (P1) sees another CPU as newly idle (P0), then any pre-idle memory access (P0's write to X) will be seen by any code following the grace period (P1's read from X). This is a simple release-acquire pair forcing ordering in a message-passing (MP) litmus test. Of course, if the grace-period kthread detects the CPU as non-idle, it will refrain from reporting a quiescent state on behalf of that CPU, so there are no ordering requirements from the grace-period kthread in that case. However, other subsystems call rcu_is_idle_cpu() to check for CPUs being non-idle from an RCU perspective. That case is also verified by the above litmus tests with the proviso that the sense of the low-order bit of the DYNTICKS counter be inverted. Unfortunately, on x86 smp_mb() is as expensive as a cache-local atomic increment. This commit therefore weakens only the read from ->dynticks. However, the updates are abstracted into a rcu_dynticks_inc() function to ease any future changes that might be needed. [ paulmck: Apply Linus Torvalds feedback. ] Link: https://lore.kernel.org/lkml/20210721202127.2129660-4-paulmck@kernel.org/ Suggested-by: Linus Torvalds Acked-by: Mathieu Desnoyers Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 42a0032dd99f..381ebd3ae26c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -251,6 +251,15 @@ void rcu_softirq_qs(void) rcu_tasks_qs(current, false); } +/* + * Increment the current CPU's rcu_data structure's ->dynticks field + * with ordering. Return the new value. + */ +static noinline noinstr unsigned long rcu_dynticks_inc(int incby) +{ + return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks)); +} + /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state, that is, @@ -267,7 +276,7 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = arch_atomic_inc_return(&this_cpu_ptr(&rcu_data)->dynticks); + seq = rcu_dynticks_inc(1); // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1)); } @@ -286,7 +295,7 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = arch_atomic_inc_return(&this_cpu_ptr(&rcu_data)->dynticks); + seq = rcu_dynticks_inc(1); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1)); @@ -308,7 +317,7 @@ static void rcu_dynticks_eqs_online(void) if (atomic_read(&rdp->dynticks) & 0x1) return; - atomic_inc(&rdp->dynticks); + rcu_dynticks_inc(1); } /* @@ -318,7 +327,7 @@ static void rcu_dynticks_eqs_online(void) */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - return !(arch_atomic_read(&this_cpu_ptr(&rcu_data)->dynticks) & 0x1); + return !(atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1); } /* @@ -327,7 +336,8 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) */ static int rcu_dynticks_snap(struct rcu_data *rdp) { - return atomic_add_return(0, &rdp->dynticks); + smp_mb(); // Fundamental RCU ordering guarantee. + return atomic_read_acquire(&rdp->dynticks); } /* @@ -391,12 +401,12 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) */ notrace void rcu_momentary_dyntick_idle(void) { - int special; + int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - special = atomic_add_return(2, &this_cpu_ptr(&rcu_data)->dynticks); + seq = rcu_dynticks_inc(2); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & 0x1)); + WARN_ON_ONCE(!(seq & 0x1)); rcu_preempt_deferred_qs(current); } EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); -- cgit From 5fcb3a5f04ee6422714adb02f5364042228bfc2e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 May 2021 13:35:50 -0700 Subject: rcu: Mark accesses to ->rcu_read_lock_nesting KCSAN flags accesses to ->rcu_read_lock_nesting as data races, but in the past, the overhead of marked accesses was excessive. However, that was long ago, and much has changed since then, both in terms of hardware and of compilers. Here is data taken on an eight-core laptop using Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz with a kernel built using gcc version 9.3.0, with all data in nanoseconds. 
Unmarked accesses (status quo), measured by three refscale runs: Minimum reader duration: 3.286 2.851 3.395 Median reader duration: 3.698 3.531 3.4695 Maximum reader duration: 4.481 5.215 5.157 Marked accesses, also measured by three refscale runs: Minimum reader duration: 3.501 3.677 3.580 Median reader duration: 4.053 3.723 3.895 Maximum reader duration: 7.307 4.999 5.511 This focused microbenhmark shows only sub-nanosecond differences which are unlikely to be visible at the system level. This commit therefore marks data-racing accesses to ->rcu_read_lock_nesting. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index de1dc3bb7f70..83a702a4e296 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -405,17 +405,20 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) static void rcu_preempt_read_enter(void) { - current->rcu_read_lock_nesting++; + WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1); } static int rcu_preempt_read_exit(void) { - return --current->rcu_read_lock_nesting; + int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1; + + WRITE_ONCE(current->rcu_read_lock_nesting, ret); + return ret; } static void rcu_preempt_depth_set(int val) { - current->rcu_read_lock_nesting = val; + WRITE_ONCE(current->rcu_read_lock_nesting, val); } /* -- cgit From ccfc9dd6914feaa9a81f10f9cce56eb0f7712264 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 May 2021 00:56:23 +0900 Subject: rcu/tree: Handle VM stoppage in stall detection The soft watchdog timer function checks if a virtual machine was suspended and hence what looks like a lockup in fact is a false positive. This is what kvm_check_and_clear_guest_paused() does: it tests guest PVCLOCK_GUEST_STOPPED (which is set by the host) and if it's set then we need to touch all watchdogs and bail out. Watchdog timer function runs from IRQ, so PVCLOCK_GUEST_STOPPED check works fine. There is, however, one more watchdog that runs from IRQ, so watchdog timer fn races with it, and that watchdog is not aware of PVCLOCK_GUEST_STOPPED - RCU stall detector. apic_timer_interrupt() smp_apic_timer_interrupt() hrtimer_interrupt() __hrtimer_run_queues() tick_sched_timer() tick_sched_handle() update_process_times() rcu_sched_clock_irq() This triggers RCU stalls on our devices during VM resume. If tick_sched_handle()->rcu_sched_clock_irq() runs on a VCPU before watchdog_timer_fn()->kvm_check_and_clear_guest_paused() then there is nothing on this VCPU that touches watchdogs and RCU reads stale gp stall timestamp and new jiffies value, which makes it think that RCU has stalled. Make RCU stall watchdog aware of PVCLOCK_GUEST_STOPPED and don't report RCU stalls when we resume the VM. Signed-off-by: Sergey Senozhatsky Signed-off-by: Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index bd4de5bc5807..0e7a60706d1c 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,6 +7,8 @@ * Author: Paul E. McKenney */ +#include + ////////////////////////////////////////////////////////////////////////////// // // Controlling CPU stall warnings, including delay calculation. 
@@ -698,6 +700,14 @@ static void check_cpu_stall(struct rcu_data *rdp) (READ_ONCE(rnp->qsmask) & rdp->grpmask) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like an RCU stall. Check to see if the host + * stopped the vm. + */ + if (kvm_check_and_clear_guest_paused()) + return; + /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) @@ -707,6 +717,14 @@ static void check_cpu_stall(struct rcu_data *rdp) ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like an RCU stall. Check to see if the host + * stopped the vm. + */ + if (kvm_check_and_clear_guest_paused()) + return; + /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) -- cgit From a80be428fbc1f1f3bc9ed9245906dd60850887f5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 May 2021 00:56:24 +0900 Subject: rcu: Do not disable GP stall detection in rcu_cpu_stall_reset() rcu_cpu_stall_reset() is one of the functions virtual CPUs execute during VM resume in order to handle jiffies skew that can trigger false positive stall warnings. Paul has pointed out that this approach is problematic because rcu_cpu_stall_reset() disables RCU grace period stall-detection virtually forever, while in fact it can just restart the stall-detection timeout. Suggested-by: "Paul E. McKenney" Signed-off-by: Sergey Senozhatsky Signed-off-by: Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0e7a60706d1c..e199022dce9d 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -119,17 +119,14 @@ static void panic_on_rcu_stall(void) } /** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. + * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period * * The caller must disable hard irqs. */ void rcu_cpu_stall_reset(void) { - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); } ////////////////////////////////////////////////////////////////////////////// -- cgit From b169246feb1d82dbee5f3f6a4ce57368644dce95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 May 2021 14:23:03 -0700 Subject: rcu: Start timing stall repetitions after warning complete Systems with low-bandwidth consoles can have very large printk() latencies, and on such systems it makes no sense to have the next RCU CPU stall warning message start output before the prior message completed. This commit therefore sets the time of the next stall only after the prints have completed. While printing, the time of the next stall message is set to ULONG_MAX/2 jiffies into the future. Reviewed-by: Sergey Senozhatsky Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_stall.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index e199022dce9d..42847caa3909 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -647,6 +647,7 @@ static void print_cpu_stall(unsigned long gps) static void check_cpu_stall(struct rcu_data *rdp) { + bool didstall = false; unsigned long gs1; unsigned long gs2; unsigned long gps; @@ -692,7 +693,7 @@ static void check_cpu_stall(struct rcu_data *rdp) ULONG_CMP_GE(gps, js)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + jn = jiffies + ULONG_MAX / 2; if (rcu_gp_in_progress() && (READ_ONCE(rnp->qsmask) & rdp->grpmask) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { @@ -709,6 +710,7 @@ static void check_cpu_stall(struct rcu_data *rdp) print_cpu_stall(gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); + didstall = true; } else if (rcu_gp_in_progress() && ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && @@ -726,6 +728,11 @@ static void check_cpu_stall(struct rcu_data *rdp) print_other_cpu_stall(gs2, gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); + didstall = true; + } + if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) { + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcu_state.jiffies_stall, jn); } } -- cgit From 65bfdd36c113f5d579a382d8f2847210ea4cdca6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Jun 2021 16:31:38 -0700 Subject: srcutiny: Mark read-side data races This commit marks some interrupt-induced read-side data races in __srcu_read_lock(), __srcu_read_unlock(), and srcu_torture_stats_print(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutiny.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 26344dc6483b..a0ba2ed49bc6 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); */ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = ssp->srcu_lock_nesting[idx] - 1; + int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) -- cgit From d9ee962feb4f26d4eac0042861457d941aa2df5f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Jun 2021 10:17:36 -0700 Subject: rcu: Mark lockless ->qsmask read in rcu_check_boost_fail() Accesses to ->qsmask are normally protected by ->lock, but there is an exception in the diagnostic code in rcu_check_boost_fail(). This commit therefore applies data_race() to this access to avoid KCSAN complaining about the C-language writes protected by ->lock. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 42847caa3909..6dd6c9aa3f75 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -766,7 +766,7 @@ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) rcu_for_each_leaf_node(rnp) { if (!cpup) { - if (READ_ONCE(rnp->qsmask)) { + if (data_race(READ_ONCE(rnp->qsmask))) { return false; } else { if (READ_ONCE(rnp->gp_tasks)) -- cgit From f74126dcbcbffe0d9fc3cb9bbf171b124a6791e5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 7 Jun 2021 21:57:02 -0700 Subject: rcu: Make rcu_gp_init() and rcu_gp_fqs_loop noinline to conserve stack The kbuild test project found an oversized stack frame in rcu_gp_kthread() for some kernel configurations. This oversizing was due to a very large amount of inlining, which is unnecessary due to the fact that this code executes infrequently. This commit therefore marks rcu_gp_init() and rcu_gp_fqs_loop noinline_for_stack to conserve stack space. Reported-by: kernel test robot Tested-by: Rong Chen [ paulmck: noinline_for_stack per Nathan Chancellor. ] Reviewed-by: Nathan Chancellor Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 381ebd3ae26c..d791c4d32987 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1733,7 +1733,7 @@ static void rcu_strict_gp_boundary(void *unused) /* * Initialize a new grace period. Return false if no grace period required. */ -static bool rcu_gp_init(void) +static noinline_for_stack bool rcu_gp_init(void) { unsigned long firstseq; unsigned long flags; @@ -1927,7 +1927,7 @@ static void rcu_gp_fqs(bool first_time) /* * Loop doing repeated quiescent-state forcing until the grace period ends. */ -static void rcu_gp_fqs_loop(void) +static noinline_for_stack void rcu_gp_fqs_loop(void) { bool first_gp_fqs; int gf = 0; -- cgit From d283aa1b04d9ad9ed34bfc2f51ffe0371a16ee3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 May 2021 21:08:38 -0700 Subject: rcu: Mark accesses in tree_stall.h This commit marks the accesses in tree_stall.h so as to both avoid undesirable compiler optimizations and to keep KCSAN focused on the accesses of the core algorithm. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 63 ++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6dd6c9aa3f75..a8d0fcf0826f 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -465,9 +465,10 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - data_race(rcu_state.gp_flags), - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->__state : ~0, cpu); + data_race(READ_ONCE(rcu_state.gp_flags)), + gp_state_getname(rcu_state.gp_state), + data_race(READ_ONCE(rcu_state.gp_state)), + gpk ? 
data_race(READ_ONCE(gpk->__state)) : ~0, cpu); if (gpk) { pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); @@ -510,7 +511,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void) (long)rcu_seq_current(&rcu_state.gp_seq), data_race(rcu_state.gp_flags), gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, - gpk->__state); + data_race(READ_ONCE(gpk->__state))); pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu)); } @@ -569,11 +570,11 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) pr_err("INFO: Stall ended before state dump start\n"); } else { j = jiffies; - gpa = data_race(rcu_state.gp_activity); + gpa = data_race(READ_ONCE(rcu_state.gp_activity)); pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rcu_state.name, j - gpa, j, gpa, - data_race(jiffies_till_next_fqs), - rcu_get_root()->qsmask); + data_race(READ_ONCE(jiffies_till_next_fqs)), + data_race(READ_ONCE(rcu_get_root()->qsmask))); } } /* Rewrite if needed in case of slow consoles. */ @@ -815,32 +816,34 @@ void show_rcu_gp_kthreads(void) struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); j = jiffies; - ja = j - data_race(rcu_state.gp_activity); - jr = j - data_race(rcu_state.gp_req_activity); - js = j - data_race(rcu_state.gp_start); - jw = j - data_race(rcu_state.gp_wake_time); + ja = j - data_race(READ_ONCE(rcu_state.gp_activity)); + jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity)); + js = j - data_race(READ_ONCE(rcu_state.gp_start)); + jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time)); pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, t ? t->__state : 0x1ffff, t ? t->rt_priority : 0xffU, - js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq), - (long)data_race(rcu_state.gp_seq), - (long)data_race(rcu_get_root()->gp_seq_needed), - data_race(rcu_state.gp_max), - data_race(rcu_state.gp_flags)); + data_race(READ_ONCE(rcu_state.gp_state)), + t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? 
t->rt_priority : 0xffU, + js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)), + (long)data_race(READ_ONCE(rcu_state.gp_seq)), + (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)), + data_race(READ_ONCE(rcu_state.gp_max)), + data_race(READ_ONCE(rcu_state.gp_flags))); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) && - !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) && - !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks)) + !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) && + !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks))) continue; pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n", rnp->grplo, rnp->grphi, - (long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed), - data_race(rnp->qsmask), - ".b"[!!data_race(rnp->boost_kthread_task)], - ".B"[!!data_race(rnp->boost_tasks)], - ".E"[!!data_race(rnp->exp_tasks)], - ".G"[!!data_race(rnp->gp_tasks)], - data_race(rnp->n_boosts)); + (long)data_race(READ_ONCE(rnp->gp_seq)), + (long)data_race(READ_ONCE(rnp->gp_seq_needed)), + data_race(READ_ONCE(rnp->qsmask)), + ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))], + ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))], + ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))], + ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))], + data_race(READ_ONCE(rnp->n_boosts))); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -850,12 +853,12 @@ void show_rcu_gp_kthreads(void) READ_ONCE(rdp->gp_seq_needed))) continue; pr_info("\tcpu %d ->gp_seq_needed %ld\n", - cpu, (long)data_race(rdp->gp_seq_needed)); + cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed))); } } for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); - cbs += data_race(rdp->n_cbs_invoked); + cbs += data_race(READ_ONCE(rdp->n_cbs_invoked)); if (rcu_segcblist_is_offloaded(&rdp->cblist)) show_rcu_nocb_state(rdp); } @@ -937,11 +940,11 @@ void rcu_fwd_progress_check(unsigned long j) if (rcu_gp_in_progress()) { pr_info("%s: GP age %lu jiffies\n", - __func__, jiffies - rcu_state.gp_start); + __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start))); show_rcu_gp_kthreads(); } else { pr_info("%s: Last GP end %lu jiffies ago\n", - __func__, jiffies - rcu_state.gp_end); + __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end))); preempt_disable(); rdp = this_cpu_ptr(&rcu_data); rcu_check_gp_start_stall(rdp->mynode, rdp, j); -- cgit From eb880949ef41c98a203c4a033e06e05854d902ef Mon Sep 17 00:00:00 2001 From: Liu Song Date: Tue, 29 Jun 2021 21:55:51 +0800 Subject: rcu: Remove useless "ret" update in rcu_gp_fqs_loop() Within rcu_gp_fqs_loop(), the "ret" local variable is set to the return value from swait_event_idle_timeout_exclusive(), but "ret" is unconditionally overwritten later in the code. This commit therefore removes this useless assignment. Signed-off-by: Liu Song Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d791c4d32987..2f000e8a315d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1954,8 +1954,8 @@ static noinline_for_stack void rcu_gp_fqs_loop(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); - ret = swait_event_idle_timeout_exclusive( - rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq, + rcu_gp_fqs_check_wake(&gf), j); rcu_gp_torture_wait(); WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ -- cgit From 8211e922de2854130e3633f52cd4fc2d7817ceb0 Mon Sep 17 00:00:00 2001 From: Liu Song Date: Wed, 30 Jun 2021 22:08:02 +0800 Subject: rcu: Use per_cpu_ptr to get the pointer of per_cpu variable There are a few remaining locations in kernel/rcu that still use "&per_cpu()". This commit replaces them with "per_cpu_ptr(&)", and does not introduce any functional change. Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Neeraj Upadhyay Reviewed-by: Joel Fernandes (Google) Signed-off-by: Liu Song Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_stall.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8536c55df514..21f00194e69d 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -920,7 +920,7 @@ reset_ipi: // Allow future IPIs to be sent on CPU and for task. // Also order this IPI handler against any later manipulations of // the intended task. - smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2f000e8a315d..7403328cdf99 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1286,7 +1286,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) */ jtsq = READ_ONCE(jiffies_to_sched_qs); ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); - rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); + rnhqp = per_cpu_ptr(&rcu_data.rcu_need_heavy_qs, rdp->cpu); if (!READ_ONCE(*rnhqp) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || time_after(jiffies, rcu_state.jiffies_resched) || diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index a8d0fcf0826f..677ee3d8671b 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -351,7 +351,7 @@ static void rcu_dump_cpu_stacks(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", rdp->last_accelerate & 0xffff, jiffies & 0xffff, -- cgit From 508958259bb3d9ca4ec37f0abdb211e9a6f3daa2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 6 Jul 2021 01:43:43 +0200 Subject: rcu: Explain why rcu_all_qs() is a stub in preemptible TREE RCU The cond_resched() function reports an RCU quiescent state only in non-preemptible TREE RCU implementation. This commit therefore adds a comment explaining why cond_resched() does nothing in preemptible kernels. 
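The distinction that the new comment draws can be condensed into the following configuration-dependent sketch; report_qs_if_possible() is a hypothetical helper used only for illustration and is not added by the patch:

  #include <linux/rcupdate.h>

  static void report_qs_if_possible(void)
  {
  #ifdef CONFIG_PREEMPT_RCU
          /*
           * Preemptible RCU: readers record their nesting depth in
           * current->rcu_read_lock_nesting (see rcu_preempt_depth()),
           * so the scheduler tick can detect read-side critical
           * sections on its own and nothing needs to be reported here.
           */
  #else
          /*
           * Non-preemptible RCU: readers leave no in-memory hint, so a
           * CPU-bound kernel loop must explicitly offer a quiescent
           * state when RCU is in urgent need of one.
           */
          rcu_all_qs();
  #endif
  }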
Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Joel Fernandes Cc: Uladzislau Rezki Cc: Boqun Feng Cc: Peter Zijlstra (Intel) Cc: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d9ff40f4661..6a03c3fac55c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7781,6 +7781,17 @@ int __sched __cond_resched(void) preempt_schedule_common(); return 1; } + /* + * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * whether the current CPU is in an RCU read-side critical section, + * so the tick can report quiescent states even for CPUs looping + * in kernel context. In contrast, in non-preemptible kernels, + * RCU readers leave no in-memory hints, which means that CPU-bound + * processes executing in kernel context might never report an + * RCU quiescent state. Therefore, the following code causes + * cond_resched() to report a quiescent state, but only when RCU + * is in urgent need of one. + */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); #endif -- cgit From 521c89b3a4022269c75b35062358d1dae4ebfa79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jul 2021 11:52:12 -0700 Subject: rcu: Print human-readable message for schedule() in RCU reader The WARN_ON_ONCE() invocation within the CONFIG_PREEMPT=y version of rcu_note_context_switch() triggers when there is a voluntary context switch in an RCU read-side critical section, but there is quite a gap between the output of that WARN_ON_ONCE() and this RCU-usage error. This commit therefore converts the WARN_ON_ONCE() to a WARN_ONCE() that explicitly describes the problem in its message. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 83a702a4e296..e8b45ab72a79 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -346,7 +346,7 @@ void rcu_note_context_switch(bool preempt) trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0); + WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!"); if (rcu_preempt_depth() > 0 && !t->rcu_read_unlock_special.b.blocked) { -- cgit From d3dd95a8853f1d588e38e9d9d7c8cc2da412cc36 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:14 +0200 Subject: rcu: Replace deprecated CPU-hotplug functions The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: rcu@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7403328cdf99..f53a3c1bf1d1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4011,7 +4011,7 @@ void rcu_barrier(void) */ init_completion(&rcu_state.barrier_completion); atomic_set(&rcu_state.barrier_cpu_count, 2); - get_online_cpus(); + cpus_read_lock(); /* * Force each CPU with callbacks to register a new callback. @@ -4042,7 +4042,7 @@ void rcu_barrier(void) rcu_state.barrier_sequence); } } - put_online_cpus(); + cpus_read_unlock(); /* * Now that we have an rcu_barrier_callback() callback on each -- cgit From ed4fa2442e87bf9143d608473df117589e4bfc70 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:16:18 +0200 Subject: torture: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Cc: Davidlohr Bueso Cc: "Paul E. McKenney" Cc: Josh Triplett Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/torture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 0a315c387bed..bb8f411c974b 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -521,11 +521,11 @@ static void torture_shuffle_tasks(void) struct shuffle_task *stp; cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); + cpus_read_lock(); /* No point in shuffling if there is only one online CPU (ex: UP) */ if (num_online_cpus() == 1) { - put_online_cpus(); + cpus_read_unlock(); return; } @@ -541,7 +541,7 @@ static void torture_shuffle_tasks(void) set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); mutex_unlock(&shuffle_task_mutex); - put_online_cpus(); + cpus_read_unlock(); } /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the -- cgit