diff options
Diffstat (limited to 'kernel/rcu')
| -rw-r--r-- | kernel/rcu/Kconfig | 95 | ||||
| -rw-r--r-- | kernel/rcu/Kconfig.debug | 100 | ||||
| -rw-r--r-- | kernel/rcu/rcu.h | 168 | ||||
| -rw-r--r-- | kernel/rcu/rcu_segcblist.c | 17 | ||||
| -rw-r--r-- | kernel/rcu/rcu_segcblist.h | 14 | ||||
| -rw-r--r-- | kernel/rcu/rcuscale.c | 518 | ||||
| -rw-r--r-- | kernel/rcu/rcutorture.c | 1532 | ||||
| -rw-r--r-- | kernel/rcu/refscale.c | 816 | ||||
| -rw-r--r-- | kernel/rcu/srcutiny.c | 54 | ||||
| -rw-r--r-- | kernel/rcu/srcutree.c | 989 | ||||
| -rw-r--r-- | kernel/rcu/sync.c | 20 | ||||
| -rw-r--r-- | kernel/rcu/tasks.h | 653 | ||||
| -rw-r--r-- | kernel/rcu/tiny.c | 48 | ||||
| -rw-r--r-- | kernel/rcu/tree.c | 2325 | ||||
| -rw-r--r-- | kernel/rcu/tree.h | 92 | ||||
| -rw-r--r-- | kernel/rcu/tree_exp.h | 415 | ||||
| -rw-r--r-- | kernel/rcu/tree_nocb.h | 581 | ||||
| -rw-r--r-- | kernel/rcu/tree_plugin.h | 265 | ||||
| -rw-r--r-- | kernel/rcu/tree_stall.h | 342 | ||||
| -rw-r--r-- | kernel/rcu/update.c | 76 |
20 files changed, 6182 insertions, 2938 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index ab62074174c3..4d9b21f69eaa 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -18,7 +18,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPTION + default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC) select TREE_RCU help This option selects the RCU implementation that is @@ -31,7 +31,7 @@ config PREEMPT_RCU config TINY_RCU bool - default y if !PREEMPTION && !SMP + default y if !PREEMPT_RCU && !SMP help This option selects the RCU implementation that is designed for UP systems from which real-time response @@ -53,9 +53,6 @@ config RCU_EXPERT Say N if you are unsure. -config SRCU - def_bool y - config TINY_SRCU bool default y if TINY_RCU @@ -68,6 +65,19 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config FORCE_NEED_SRCU_NMI_SAFE + bool "Force selection of NEED_SRCU_NMI_SAFE" + depends on !TINY_SRCU + depends on RCU_EXPERT + depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select NEED_SRCU_NMI_SAFE + default n + help + This option forces selection of the NEED_SRCU_NMI_SAFE + Kconfig option, allowing testing of srcu_read_lock_nmisafe() + and srcu_read_unlock_nmisafe() on architectures (like x86) + that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option. + config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU @@ -88,9 +98,13 @@ config FORCE_TASKS_RCU idle, and user-mode execution as quiescent states. Not for manual selection in most cases. -config TASKS_RCU +config NEED_TASKS_RCU bool default n + +config TASKS_RCU + bool + default NEED_TASKS_RCU && PREEMPTION select IRQ_WORK config FORCE_TASKS_RUDE_RCU @@ -248,16 +262,24 @@ config RCU_NOCB_CPU workloads will incur significant increases in context-switch rates. - This option offloads callback invocation from the set of CPUs - specified at boot time by the rcu_nocbs parameter. For each - such CPU, a kthread ("rcuox/N") will be created to invoke - callbacks, where the "N" is the CPU being offloaded, and where - the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for - RCU-sched (!PREEMPTION kernels). Nothing prevents this kthread - from running on the specified CPUs, but (1) the kthreads may be - preempted between each callback, and (2) affinity or cgroups can - be used to force the kthreads to run on whatever set of CPUs is - desired. + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "p" for RCU-preempt (PREEMPTION kernels) + and "s" for RCU-sched (!PREEMPTION kernels). This option + also creates another kthread for each sqrt(nr_cpu_ids) CPUs + ("rcuog/N", where N is the first CPU in that group to come + online), which handles grace periods for its group. Nothing + prevents these kthreads from running on the specified CPUs, + but (1) the kthreads may be preempted between each callback, + and (2) affinity or cgroups can be used to force the kthreads + to run on whatever set of CPUs is desired. + + The sqrt(nr_cpu_ids) grouping may be overridden using the + rcutree.rcu_nocb_gp_stride kernel boot parameter. This can + be especially helpful for smaller numbers of CPUs, where + sqrt(nr_cpu_ids) can be a bit of a blunt instrument. Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. @@ -314,7 +336,44 @@ config RCU_LAZY depends on RCU_NOCB_CPU default n help - To save power, batch RCU callbacks and flush after delay, memory - pressure, or callback list growing too big. + To save power, batch RCU callbacks and delay starting the + corresponding grace period for multiple seconds. The grace + period will be started after this delay, in case of memory + pressure, or if the corresponding CPU's callback list grows + too large. + + These delays happen only on rcu_nocbs CPUs, that is, CPUs + whose callbacks have been offloaded. + + Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to + globally disable these delays. + +config RCU_LAZY_DEFAULT_OFF + bool "Turn RCU lazy invocation off by default" + depends on RCU_LAZY + default n + help + Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel + to boot with these energy-efficiency delays disabled. Use the + rcutree.enable_rcu_lazy=0 kernel-boot parameter to override + the this option at boot time, thus re-enabling these delays. + +config RCU_DOUBLE_CHECK_CB_TIME + bool "RCU callback-batch backup time check" + depends on RCU_EXPERT + default n + help + Use this option to provide more precise enforcement of the + rcutree.rcu_resched_ns module parameter in situations where + a single RCU callback might run for hundreds of microseconds, + thus defeating the 32-callback batching used to amortize the + cost of the fine-grained but expensive local_clock() function. + + This option rounds rcutree.rcu_resched_ns up to the next + jiffy, and overrides the 32-callback batching if this limit + is exceeded. + + Say Y here if you need tighter callback-limit enforcement. + Say N here if you are unsure. endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 232e29fe3e5e..625d75392647 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -53,6 +53,51 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_TORTURE_TEST_CHK_RDR_STATE + bool "Check rcutorture reader state" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to check the desired rcutorture + reader state for each segment against the actual context. + Note that PREEMPT_COUNT must be enabled if the preempt-disabled + and bh-disabled checks are to take effect, and that PREEMPT_RCU + must be enabled for the RCU-nesting checks to take effect. + These checks add overhead, and this Kconfig options is therefore + disabled by default. + + Say Y here if you want rcutorture reader contexts checked. + Say N if you are unsure. + +config RCU_TORTURE_TEST_LOG_CPU + bool "Log CPU for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + number of the CPU that the reader was running on at the time. + This information can be useful, but it does incur additional + overhead, overhead that can make both failures and close calls + less probable. + + Say Y here if you want CPU IDs logged. + Say N if you are unsure. + +config RCU_TORTURE_TEST_LOG_GP + bool "Log grace-period numbers for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + corresponding grace-period sequence numbers. This information + can be useful, but it does incur additional overhead, overhead + that can make both failures and close calls less probable. + + Say Y here if you want grace-period sequence numbers logged. + Say N if you are unsure. + config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL @@ -82,7 +127,7 @@ config RCU_CPU_STALL_TIMEOUT config RCU_EXP_CPU_STALL_TIMEOUT int "Expedited RCU CPU stall timeout in milliseconds" depends on RCU_STALL_COMMON - range 0 21000 + range 0 300000 default 0 help If a given expedited RCU grace period extends more than the @@ -92,6 +137,44 @@ config RCU_EXP_CPU_STALL_TIMEOUT says to use the RCU_CPU_STALL_TIMEOUT value converted from seconds to milliseconds. +config RCU_CPU_STALL_CPUTIME + bool "Provide additional RCU stall debug information" + depends on RCU_STALL_COMMON + default n + help + Collect statistics during the sampling period, such as the number of + (hard interrupts, soft interrupts, task switches) and the cputime of + (hard interrupts, soft interrupts, kernel tasks) are added to the + RCU stall report. For multiple continuous RCU stalls, all sampling + periods begin at half of the first RCU stall timeout. + The boot option rcupdate.rcu_cpu_stall_cputime has the same function + as this one, but will override this if it exists. + +config RCU_CPU_STALL_NOTIFIER + bool "Provide RCU CPU-stall notifiers" + depends on RCU_STALL_COMMON + depends on DEBUG_KERNEL + depends on RCU_EXPERT + default n + help + WARNING: You almost certainly do not want this!!! + + Enable RCU CPU-stall notifiers, which are invoked just before + printing the RCU CPU stall warning. As such, bugs in notifier + callbacks can prevent stall warnings from being printed. + And the whole reason that a stall warning is being printed is + that something is hung up somewhere. Therefore, the notifier + callbacks must be written extremely carefully, preferably + containing only lockless code. After all, it is quite possible + that the whole reason that the RCU CPU stall is happening in + the first place is that someone forgot to release whatever lock + that you are thinking of acquiring. In which case, having your + notifier callback acquire that lock will hang, preventing the + RCU CPU stall warning from appearing. + + Say Y here if you want RCU CPU stall notifiers (you don't want them) + Say N if you are unsure. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL @@ -130,4 +213,19 @@ config RCU_STRICT_GRACE_PERIOD when looking for certain types of RCU usage bugs, for example, too-short RCU read-side critical sections. + +config RCU_DYNTICKS_TORTURE + bool "Minimize RCU dynticks counter size" + depends on RCU_EXPERT && !COMPILE_TEST + default n + help + This option sets the width of the dynticks counter to its + minimum usable value. This minimum width greatly increases + the probability of flushing out bugs involving counter wrap, + but it also increases the probability of extending grace period + durations. This Kconfig option should therefore be avoided in + production due to the consequent increased probability of OOMs. + + This has no value for production and is only for testing. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index c5aa934de59b..9cf01832a6c3 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -10,18 +10,56 @@ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include <linux/slab.h> #include <trace/events/rcu.h> /* * Grace-period counter management. + * + * The two least significant bits contain the control flags. + * The most significant bits contain the grace-period sequence counter. + * + * When both control flags are zero, no grace period is in progress. + * When either bit is non-zero, a grace period has started and is in + * progress. When the grace period completes, the control flags are reset + * to 0 and the grace-period sequence counter is incremented. + * + * However some specific RCU usages make use of custom values. + * + * SRCU special control values: + * + * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node + * is initialized. + * + * SRCU_STATE_IDLE : No SRCU gp is in progress + * + * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates + * we are scanning the readers on the slot + * defined as inactive (there might well + * be pending readers that will use that + * index, but their number is bounded). + * + * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state() + * Indicates we are flipping the readers + * index and then scanning the readers on the + * slot newly designated as inactive (again, + * the number of pending readers that will use + * this inactive index is bounded). + * + * RCU polled GP special control value: + * + * RCU_GET_STATE_COMPLETED : State value indicating an already-completed + * polled GP has completed. This value covers + * both the state and the counter of the + * grace-period sequence number. */ -#define RCU_SEQ_CTR_SHIFT 2 -#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) - /* Low-order bit definition for polled grace-period APIs. */ #define RCU_GET_STATE_COMPLETED 0x1 +/* A complete grace period count */ +#define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1) + extern int sysctl_sched_rt_runtime; /* @@ -122,12 +160,21 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred, but do not allow the * (ULONG_MAX / 2) safety-factor/guard-band. + * + * The token returned by get_state_synchronize_rcu_full() is based on + * rcu_state.gp_seq but it is tested in poll_state_synchronize_rcu_full() + * against the root rnp->gp_seq. Since rcu_seq_start() is first called + * on rcu_state.gp_seq and only later reflected on the root rnp->gp_seq, + * it is possible that rcu_seq_snap(rcu_state.gp_seq) returns 2 full grace + * periods ahead of the root rnp->gp_seq. To prevent false-positives with the + * full polling API that a wrap around instantly completed the GP, when nothing + * like that happened, adjust for the 2 GPs in the ULONG_CMP_LT(). */ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP)); } /* @@ -211,6 +258,17 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_callback(struct rcu_head *rhp) +{ + if (unlikely(!rhp->func)) + kmem_dump_obj(rhp); +} + +static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp) +{ + return rhp->next == rhp; +} + extern int rcu_cpu_stall_suppress_at_boot; static inline bool rcu_stall_is_suppressed_at_boot(void) @@ -218,12 +276,16 @@ static inline bool rcu_stall_is_suppressed_at_boot(void) return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended(); } +extern int rcu_cpu_stall_notifiers; + #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; extern int rcu_cpu_stall_timeout; extern int rcu_exp_cpu_stall_timeout; +extern int rcu_cpu_stall_cputime; +extern bool rcu_exp_stall_task_details __read_mostly; int rcu_jiffies_till_stall_check(void); int rcu_exp_jiffies_till_stall_check(void); @@ -339,11 +401,13 @@ extern void rcu_init_geometry(void); * specified state structure (for SRCU) or the only rcu_state structure * (for RCU). */ -#define srcu_for_each_node_breadth_first(sp, rnp) \ +#define _rcu_for_each_node_breadth_first(sp, rnp) \ for ((rnp) = &(sp)->node[0]; \ (rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++) #define rcu_for_each_node_breadth_first(rnp) \ - srcu_for_each_node_breadth_first(&rcu_state, rnp) + _rcu_for_each_node_breadth_first(&rcu_state, rnp) +#define srcu_for_each_node_breadth_first(ssp, rnp) \ + _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp) /* * Scan the leaves of the rcu_node hierarchy for the rcu_state structure. @@ -447,23 +511,49 @@ do { \ /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ static inline bool rcu_gp_is_normal(void) { return true; } static inline bool rcu_gp_is_expedited(void) { return false; } +static inline bool rcu_async_should_hurry(void) { return false; } static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } -static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } +static inline void rcu_async_hurry(void) { } +static inline void rcu_async_relax(void) { } +static inline bool rcu_cpu_online(int cpu) { return true; } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ +bool rcu_async_should_hurry(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); +void rcu_async_hurry(void); +void rcu_async_relax(void); void rcupdate_announce_bootup_oddness(void); +bool rcu_cpu_online(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC void show_rcu_tasks_gp_kthreads(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void show_rcu_tasks_gp_kthreads(void) {} #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ -void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TASKS_RCU +struct task_struct *get_rcu_tasks_gp_kthread(void); +void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq); +#endif // # ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_RUDE_RCU +struct task_struct *get_rcu_tasks_rude_gp_kthread(void); +void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq); +#endif // # ifdef CONFIG_TASKS_RUDE_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU +void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); +#endif + +#ifdef CONFIG_TASKS_RCU_GENERIC +void tasks_cblist_init_generic(void); +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void tasks_cblist_init_generic(void) { } +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ + #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 @@ -479,25 +569,25 @@ enum rcutorture_type { }; #if defined(CONFIG_RCU_LAZY) -unsigned long rcu_lazy_get_jiffies_till_flush(void); -void rcu_lazy_set_jiffies_till_flush(unsigned long j); +unsigned long rcu_get_jiffies_lazy_flush(void); +void rcu_set_jiffies_lazy_flush(unsigned long j); #else -static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; } -static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { } +static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; } +static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { } #endif #if defined(CONFIG_TREE_RCU) -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gp_seq); +void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c); void rcu_gp_set_torture_wait(int duration); +void rcu_set_gpwrap_lag(unsigned long lag); +int rcu_get_gpwrap_count(int cpu); #else -static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, unsigned long *gp_seq) +static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { *flags = 0; *gp_seq = 0; @@ -513,38 +603,32 @@ void do_trace_rcu_torture_read(const char *rcutorturename, do { } while (0) #endif static inline void rcu_gp_set_torture_wait(int duration) { } +static inline void rcu_set_gpwrap_lag(unsigned long lag) { } +static inline int rcu_get_gpwrap_count(int cpu) { return 0; } #endif - -#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) -long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); -#endif +unsigned long long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); #ifdef CONFIG_TINY_SRCU -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, +static inline void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, unsigned long *gp_seq) { - if (test_type != SRCU_FLAVOR) - return; *flags = 0; *gp_seq = sp->srcu_idx; } #elif defined(CONFIG_TREE_SRCU) -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, +void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, unsigned long *gp_seq); #endif #ifdef CONFIG_TINY_RCU -static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } +static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long -srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } @@ -553,26 +637,26 @@ static inline void rcu_fwd_progress_check(unsigned long j) { } static inline void rcu_gp_slow_register(atomic_t *rgssp) { } static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ -bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); +bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long srcu_batches_completed(struct srcu_struct *sp); bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; -#ifdef CONFIG_RCU_EXP_KTHREAD extern struct kthread_worker *rcu_exp_gp_kworker; -extern struct kthread_worker *rcu_exp_par_gp_kworker; -#else /* !CONFIG_RCU_EXP_KTHREAD */ -extern struct workqueue_struct *rcu_par_gp_wq; -#endif /* CONFIG_RCU_EXP_KTHREAD */ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TINY_SRCU +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +#else // #ifdef CONFIG_TINY_SRCU +unsigned long srcu_batches_completed(struct srcu_struct *sp); +#endif // #else // #ifdef CONFIG_TINY_SRCU + #ifdef CONFIG_RCU_NOCB_CPU void rcu_bind_current_to_nocb(void); #else @@ -595,4 +679,16 @@ void show_rcu_tasks_trace_gp_kthread(void); static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif +#ifdef CONFIG_TINY_RCU +static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } +#else +bool rcu_cpu_beenfullyonline(int cpu); +#endif + +#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) +int rcu_stall_notifier_call_chain(unsigned long val, void *v); +#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) +static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; } +#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index c54ea2b6a36b..298a2c573f02 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -89,7 +89,7 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) } /* Get the length of a segment of the rcu_segcblist structure. */ -static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) +long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) { return READ_ONCE(rsclp->seglen[seg]); } @@ -261,17 +261,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Mark the specified rcu_segcblist structure as offloaded (or not) - */ -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) -{ - if (offload) - rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED); - else - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); -} - -/* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? */ @@ -368,7 +357,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1]) + if (!rcu_segcblist_segempty(rsclp, i)) break; rcu_segcblist_inc_seglen(rsclp, i); WRITE_ONCE(*rsclp->tails[i], rhp); @@ -551,7 +540,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * as their ->gp_seq[] grace-period completion sequence number. */ for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && + if (!rcu_segcblist_segempty(rsclp, i) && ULONG_CMP_LT(rsclp->gp_seq[i], seq)) break; diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 431cee212467..fadc08ad4b7b 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,6 +15,8 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } +long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg); + /* Return number of callbacks in segmented callback list by summing seglen. */ long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp); @@ -87,16 +89,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING)) - return true; - - return false; -} - -static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) -{ - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE)) + rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED)) return true; return false; @@ -127,7 +120,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 91fb5905a008..7484d8ad5767 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -39,9 +39,11 @@ #include <linux/torture.h> #include <linux/vmalloc.h> #include <linux/rcupdate_trace.h> +#include <linux/sched/debug.h> #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); @@ -84,15 +86,17 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); #endif torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); -torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, minruntime, 0, "Minimum run time (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, "Shutdown at end of scalability tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?"); @@ -101,6 +105,20 @@ static char *scale_type = "rcu"; module_param(scale_type, charp, 0444); MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); +// Structure definitions for custom fixed-per-task allocator. +struct writer_mblock { + struct rcu_head wmb_rh; + struct llist_node wmb_node; + struct writer_freelist *wmb_wfl; +}; + +struct writer_freelist { + struct llist_head ws_lhg; + atomic_t ws_inflight; + struct llist_head ____cacheline_internodealigned_in_smp ws_lhp; + struct writer_mblock *ws_mblocks; +}; + static int nrealreaders; static int nrealwriters; static struct task_struct **writer_tasks; @@ -108,6 +126,8 @@ static struct task_struct **reader_tasks; static struct task_struct *shutdown_task; static u64 **writer_durations; +static bool *writer_done; +static struct writer_freelist *writer_freelists; static int *writer_n_durations; static atomic_t n_rcu_scale_reader_started; static atomic_t n_rcu_scale_writer_started; @@ -117,7 +137,6 @@ static u64 t_rcu_scale_writer_started; static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; static unsigned long b_rcu_gp_test_finished; -static DEFINE_PER_CPU(atomic_t, n_async_inflight); #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -139,6 +158,8 @@ struct rcu_scale_ops { void (*gp_barrier)(void); void (*sync)(void); void (*exp_sync)(void); + struct task_struct *(*rso_gp_kthread)(void); + void (*stats)(void); const char *name; }; @@ -220,6 +241,11 @@ static void srcu_scale_synchronize(void) synchronize_srcu(srcu_ctlp); } +static void srcu_scale_stats(void) +{ + srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG); +} + static void srcu_scale_synchronize_expedited(void) { synchronize_srcu_expedited(srcu_ctlp); @@ -237,6 +263,7 @@ static struct rcu_scale_ops srcu_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcu" }; @@ -266,6 +293,7 @@ static struct rcu_scale_ops srcud_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcud" }; @@ -284,6 +312,11 @@ static void tasks_scale_read_unlock(int idx) { } +static void rcu_tasks_scale_stats(void) +{ + rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -295,6 +328,8 @@ static struct rcu_scale_ops tasks_ops = { .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, + .rso_gp_kthread = get_rcu_tasks_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats, .name = "tasks" }; @@ -306,6 +341,48 @@ static struct rcu_scale_ops tasks_ops = { #endif // #else // #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_RUDE_RCU + +/* + * Definitions for RCU-tasks-rude scalability testing. + */ + +static int tasks_rude_scale_read_lock(void) +{ + return 0; +} + +static void tasks_rude_scale_read_unlock(int idx) +{ +} + +static void rcu_tasks_rude_scale_stats(void) +{ + rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG); +} + +static struct rcu_scale_ops tasks_rude_ops = { + .ptype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_rude_scale_read_lock, + .readunlock = tasks_rude_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats, + .name = "tasks-rude" +}; + +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU + #ifdef CONFIG_TASKS_TRACE_RCU /* @@ -323,6 +400,11 @@ static void tasks_trace_scale_read_unlock(int idx) rcu_read_unlock_trace(); } +static void rcu_tasks_trace_scale_stats(void) +{ + rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_tracing_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -334,6 +416,8 @@ static struct rcu_scale_ops tasks_tracing_ops = { .gp_barrier = rcu_barrier_tasks_trace, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, + .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats, .name = "tasks-tracing" }; @@ -394,12 +478,52 @@ rcu_scale_reader(void *arg) } /* + * Allocate a writer_mblock structure for the specified rcu_scale_writer + * task. + */ +static struct writer_mblock *rcu_scale_alloc(long me) +{ + struct llist_node *llnp; + struct writer_freelist *wflp; + struct writer_mblock *wmbp; + + if (WARN_ON_ONCE(!writer_freelists)) + return NULL; + wflp = &writer_freelists[me]; + if (llist_empty(&wflp->ws_lhp)) { + // ->ws_lhp is private to its rcu_scale_writer task. + wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node); + wflp->ws_lhp.first = &wmbp->wmb_node; + } + llnp = llist_del_first(&wflp->ws_lhp); + if (!llnp) + return NULL; + return container_of(llnp, struct writer_mblock, wmb_node); +} + +/* + * Free a writer_mblock structure to its rcu_scale_writer task. + */ +static void rcu_scale_free(struct writer_mblock *wmbp) +{ + struct writer_freelist *wflp; + + if (!wmbp) + return; + wflp = wmbp->wmb_wfl; + llist_add(&wmbp->wmb_node, &wflp->ws_lhg); +} + +/* * Callback function for asynchronous grace periods from rcu_scale_writer(). */ static void rcu_scale_async_cb(struct rcu_head *rhp) { - atomic_dec(this_cpu_ptr(&n_async_inflight)); - kfree(rhp); + struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh); + struct writer_freelist *wflp = wmbp->wmb_wfl; + + atomic_dec(&wflp->ws_inflight); + rcu_scale_free(wmbp); } /* @@ -410,12 +534,16 @@ rcu_scale_writer(void *arg) { int i = 0; int i_max; + unsigned long jdone; long me = (long)arg; - struct rcu_head *rhp = NULL; + bool selfreport = false; bool started = false, done = false, alldone = false; u64 t; + DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; + struct writer_freelist *wflp = &writer_freelists[me]; + struct writer_mblock *wmbp = NULL; VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); @@ -424,7 +552,7 @@ rcu_scale_writer(void *arg) sched_set_fifo_low(current); if (holdoff) - schedule_timeout_uninterruptible(holdoff * HZ); + schedule_timeout_idle(holdoff * HZ); /* * Wait until rcu_end_inkernel_boot() is called for normal GP tests @@ -445,29 +573,36 @@ rcu_scale_writer(void *arg) } } + jdone = jiffies + minruntime * HZ; do { + bool gp_succeeded = false; + if (writer_holdoff) udelay(writer_holdoff); + if (writer_holdoff_jiffies) + schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_async) { -retry: - if (!rhp) - rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); - if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_scale_async_cb); - rhp = NULL; + if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { + if (!wmbp) + wmbp = rcu_scale_alloc(me); + if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) { + atomic_inc(&wflp->ws_inflight); + cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb); + wmbp = NULL; + gp_succeeded = true; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); - goto retry; } else { - kfree(rhp); /* Because we are stopping. */ + rcu_scale_free(wmbp); /* Because we are stopping. */ + wmbp = NULL; } } else if (gp_exp) { cur_ops->exp_sync(); + gp_succeeded = true; } else { cur_ops->sync(); + gp_succeeded = true; } t = ktime_get_mono_fast_ns(); *wdp = t - *wdp; @@ -475,8 +610,9 @@ retry: if (!started && atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) started = true; - if (!done && i >= MIN_MEAS) { + if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) { done = true; + WRITE_ONCE(writer_done[me], true); sched_set_normal(current, 0); pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", scale_type, SCALE_FLAG, me, MIN_MEAS); @@ -502,11 +638,32 @@ retry: if (done && !alldone && atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) alldone = true; - if (started && !alldone && i < MAX_MEAS - 1) + if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) { + static atomic_t dumped; + int i; + + if (!atomic_xchg(&dumped, 1)) { + for (i = 0; i < nrealwriters; i++) { + if (writer_done[i]) + continue; + pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i); + sched_show_task(writer_tasks[i]); + } + if (cur_ops->stats) + cur_ops->stats(); + } + } + if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) { + pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n", + __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone); + selfreport = true; + } + if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1) i++; rcu_scale_wait_shutdown(); } while (!torture_must_stop()); - if (gp_async) { + if (gp_async && cur_ops->async) { + rcu_scale_free(wmbp); cur_ops->gp_barrier(); } writer_n_durations[me] = i_max + 1; @@ -518,91 +675,8 @@ static void rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); -} - -static void -rcu_scale_cleanup(void) -{ - int i; - int j; - int ngps = 0; - u64 *wdp; - u64 *wdpp; - - /* - * Would like warning at start, but everything is expedited - * during the mid-boot phase, so have to wait till the end. - */ - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - if (rcu_gp_is_normal() && gp_exp) - SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - if (gp_exp && gp_async) - SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); - - if (torture_cleanup_begin()) - return; - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_scale_reader, - reader_tasks[i]); - kfree(reader_tasks); - } - - if (writer_tasks) { - for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_scale_writer, - writer_tasks[i]); - if (!writer_n_durations) - continue; - j = writer_n_durations[i]; - pr_alert("%s%s writer %d gps: %d\n", - scale_type, SCALE_FLAG, i, j); - ngps += j; - } - pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - scale_type, SCALE_FLAG, - t_rcu_scale_writer_started, t_rcu_scale_writer_finished, - t_rcu_scale_writer_finished - - t_rcu_scale_writer_started, - ngps, - rcuscale_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); - for (i = 0; i < nrealwriters; i++) { - if (!writer_durations) - break; - if (!writer_n_durations) - continue; - wdpp = writer_durations[i]; - if (!wdpp) - continue; - for (j = 0; j < writer_n_durations[i]; j++) { - wdp = &wdpp[j]; - pr_alert("%s%s %4d writer-duration: %5d %llu\n", - scale_type, SCALE_FLAG, - i, j, *wdp); - if (j % 100 == 0) - schedule_timeout_uninterruptible(1); - } - kfree(writer_durations[i]); - } - kfree(writer_tasks); - kfree(writer_durations); - kfree(writer_n_durations); - } - - /* Do torture-type-specific cleanup operations. */ - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); + "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n", + scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown); } /* @@ -625,21 +699,6 @@ static int compute_real(int n) } /* - * RCU scalability shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_scale_shutdown(void *arg) -{ - wait_event(shutdown_wq, - atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); - smp_mb(); /* Wake before output. */ - rcu_scale_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -/* * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number * of iterations and measure total time and number of GP for all iterations to complete. */ @@ -654,6 +713,8 @@ static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; static atomic_t n_kfree_scale_thread_started; static atomic_t n_kfree_scale_thread_ended; +static struct task_struct *kthread_tp; +static u64 kthread_stime; struct kfree_obj { char kfree_obj[8]; @@ -701,7 +762,7 @@ kfree_scale_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kcalloc(kfree_mult, sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; @@ -716,7 +777,7 @@ kfree_scale_thread(void *arg) // is tested. if ((kfree_rcu_test_single && !kfree_rcu_test_double) || (kfree_rcu_test_both && torture_random(&tr) & 0x800)) - kfree_rcu(alloc_ptr); + kfree_rcu_mightsleep(alloc_ptr); else kfree_rcu(alloc_ptr, rh); } @@ -735,7 +796,7 @@ kfree_scale_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), - (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + PAGES_TO_MB(mem_begin - mem_during)); if (shutdown) { smp_mb(); /* Assign before wake. */ @@ -760,6 +821,7 @@ kfree_scale_cleanup(void) torture_stop_kthread(kfree_scale_thread, kfree_reader_tasks[i]); kfree(kfree_reader_tasks); + kfree_reader_tasks = NULL; } torture_cleanup_end(); @@ -771,8 +833,8 @@ kfree_scale_cleanup(void) static int kfree_scale_shutdown(void *arg) { - wait_event(shutdown_wq, - atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); + wait_event_idle(shutdown_wq, + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); smp_mb(); /* Wake before output. */ @@ -799,6 +861,10 @@ kfree_scale_init(void) unsigned long jif_start; unsigned long orig_jif; + pr_alert("%s" SCALE_FLAG + "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n", + scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single); + // Also, do a quick self-test to ensure laziness is as much as // expected. if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) { @@ -808,9 +874,9 @@ kfree_scale_init(void) if (kfree_by_call_rcu) { /* do a test to check the timeout. */ - orig_jif = rcu_lazy_get_jiffies_till_flush(); + orig_jif = rcu_get_jiffies_lazy_flush(); - rcu_lazy_set_jiffies_till_flush(2 * HZ); + rcu_set_jiffies_lazy_flush(2 * HZ); rcu_barrier(); jif_start = jiffies; @@ -819,18 +885,18 @@ kfree_scale_init(void) smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1); - rcu_lazy_set_jiffies_till_flush(orig_jif); + rcu_set_jiffies_lazy_flush(orig_jif); if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); - WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) { pr_alert("ERROR: call_rcu() CBs are being too lazy!\n"); - WARN_ON_ONCE(1); - return -1; + firsterr = -1; + goto unwind; } } @@ -875,13 +941,152 @@ unwind: return firsterr; } +static void +rcu_scale_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + + // If built-in, just report all of the GP kthread's CPU time. + if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread) + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) { + u32 ns; + u64 us; + + kthread_stime = kthread_tp->stime - kthread_stime; + us = div_u64_rem(kthread_stime, 1000, &ns); + pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns); + show_rcu_gp_kthreads(); + } + if (kfree_rcu_test) { + kfree_scale_cleanup(); + return; + } + + if (torture_cleanup_begin()) + return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_scale_reader, + reader_tasks[i]); + kfree(reader_tasks); + reader_tasks = NULL; + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_scale_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + scale_type, SCALE_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, + ngps, + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j < writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + scale_type, SCALE_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + if (writer_freelists) { + int ctr = 0; + struct llist_node *llnp; + struct writer_freelist *wflp = &writer_freelists[i]; + + if (wflp->ws_mblocks) { + llist_for_each(llnp, wflp->ws_lhg.first) + ctr++; + llist_for_each(llnp, wflp->ws_lhp.first) + ctr++; + WARN_ONCE(ctr != gp_async_max, + "%s: ctr = %d gp_async_max = %d\n", + __func__, ctr, gp_async_max); + kfree(wflp->ws_mblocks); + } + } + } + kfree(writer_tasks); + writer_tasks = NULL; + kfree(writer_durations); + writer_durations = NULL; + kfree(writer_n_durations); + writer_n_durations = NULL; + kfree(writer_done); + writer_done = NULL; + kfree(writer_freelists); + writer_freelists = NULL; + } + + /* Do torture-type-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_scale_shutdown(void *arg) +{ + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + static int __init rcu_scale_init(void) { - long i; int firsterr = 0; + long i; + long j; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) @@ -906,6 +1111,11 @@ rcu_scale_init(void) if (cur_ops->init) cur_ops->init(); + if (cur_ops->rso_gp_kthread) { + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) + kthread_stime = kthread_tp->stime; + } if (kfree_rcu_test) return kfree_scale_init(); @@ -941,14 +1151,22 @@ rcu_scale_init(void) } while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), - GFP_KERNEL); - writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), - GFP_KERNEL); - writer_n_durations = - kcalloc(nrealwriters, sizeof(*writer_n_durations), - GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations) { + writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); + writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); + writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL); + if (gp_async) { + if (gp_async_max <= 0) { + pr_warn("%s: gp_async_max = %d must be greater than zero.\n", + __func__, gp_async_max); + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + firsterr = -EINVAL; + goto unwind; + } + writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL); + } + if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done || + (gp_async && !writer_freelists)) { SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; @@ -961,6 +1179,24 @@ rcu_scale_init(void) firsterr = -ENOMEM; goto unwind; } + if (writer_freelists) { + struct writer_freelist *wflp = &writer_freelists[i]; + + init_llist_head(&wflp->ws_lhg); + init_llist_head(&wflp->ws_lhp); + wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]), + GFP_KERNEL); + if (!wflp->ws_mblocks) { + firsterr = -ENOMEM; + goto unwind; + } + for (j = 0; j < gp_async_max; j++) { + struct writer_mblock *wmbp = &wflp->ws_mblocks[j]; + + wmbp->wmb_wfl = wflp; + llist_add(&wmbp->wmb_node, &wflp->ws_lhp); + } + } firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); if (torture_init_error(firsterr)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 634df26a2c27..07e51974b06b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -21,6 +21,7 @@ #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/rcupdate_wait.h> +#include <linux/rcu_notifier.h> #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <uapi/linux/sched/types.h> @@ -50,25 +51,31 @@ #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); -/* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) -#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ -#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ -#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ -#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ -#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ -#define RCUTORTURE_MAX_EXTEND \ +// Bits for ->extendables field, extendables param, and related definitions. +#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh. +#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts. +#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption. +#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh(). +#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched(). +#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader. +#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader. +#define RCUTORTURE_RDR_UPDOWN 0x80 // ... up-read from task, down-read from timer. + // Note: Manual start, automatic end. +#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above. +#define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ - RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) + RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN. +#define RCUTORTURE_RDR_ALLBITS \ + (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \ + RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2) #define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */ /* Must be power of two minus one. */ #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3) @@ -87,25 +94,41 @@ torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives"); torture_param(bool, gp_cond_exp_full, false, "Use conditional/async full-stateexpedited GP wait primitives"); +torture_param(int, gp_cond_wi, 16 * USEC_PER_SEC / HZ, + "Wait interval for normal conditional grace periods, us (default 16 jiffies)"); +torture_param(int, gp_cond_wi_exp, 128, + "Wait interval for expedited conditional grace periods, us (default 128 us)"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives"); torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives"); +torture_param(int, gp_poll_wi, 16 * USEC_PER_SEC / HZ, + "Wait interval for normal polled grace periods, us (default 16 jiffies)"); +torture_param(int, gp_poll_wi_exp, 128, + "Wait interval for expedited polled grace periods, us (default 128 us)"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); +torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(bool, gpwrap_lag, true, "Enable grace-period wrap lag testing"); +torture_param(int, gpwrap_lag_gps, 8, "Value to set for set_gpwrap_lag during an active testing period."); +torture_param(int, gpwrap_lag_cycle_mins, 30, "Total cycle duration for gpwrap lag testing (in minutes)"); +torture_param(int, gpwrap_lag_active_mins, 5, "Duration for which gpwrap lag is active within each cycle (in minutes)"); torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); +torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable"); +torture_param(int, preempt_interval, MSEC_PER_SEC, "Interval between preemptions (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); +torture_param(int, reader_flavor, SRCU_READ_FLAVOR_NORMAL, "Reader flavors to use, one per bit."); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -113,13 +136,17 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one."); torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); +torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; @@ -128,9 +155,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealnocbers; static int nrealreaders; +static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; +static struct task_struct *updown_task; static struct task_struct **nocb_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; @@ -140,6 +169,7 @@ static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; static struct task_struct *read_exit_task; +static struct task_struct *preempt_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -179,7 +209,6 @@ static atomic_t n_rcu_torture_mbchk_tries; static atomic_t n_rcu_torture_error; static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; @@ -251,10 +280,16 @@ struct rt_read_seg { unsigned long rt_delay_ms; unsigned long rt_delay_us; bool rt_preempted; + int rt_cpu; + int rt_end_cpu; + unsigned long long rt_gp_seq; + unsigned long long rt_gp_seq_end; + u64 rt_ts; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; static int rt_read_nsegs; +static int rt_read_preempted; static const char *rcu_torture_writer_state_getname(void) { @@ -345,12 +380,16 @@ struct rcu_torture_ops { void (*read_delay)(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp); void (*readunlock)(int idx); - int (*readlock_held)(void); + int (*readlock_held)(void); // lockdep. + int (*readlock_nesting)(void); // actual nesting, if available, -1 if not. + int (*down_read)(void); + void (*up_read)(int idx); unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + void (*exp_current)(void); unsigned long (*get_gp_state_exp)(void); unsigned long (*start_gp_poll_exp)(void); void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp); @@ -363,8 +402,6 @@ struct rcu_torture_ops { bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2); unsigned long (*get_gp_state)(void); void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); - unsigned long (*get_gp_completed)(void); - void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*start_gp_poll)(void); void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state)(unsigned long oldstate); @@ -372,6 +409,8 @@ struct rcu_torture_ops { bool (*poll_need_2gp)(bool poll, bool poll_full); void (*cond_sync)(unsigned long oldstate); void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); + int poll_active; + int poll_active_full; call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); @@ -379,12 +418,23 @@ struct rcu_torture_ops { void (*gp_kthread_dbg)(void); bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); + void (*get_gp_data)(int *flags, unsigned long *gp_seq); + void (*gp_slow_register)(atomic_t *rgssp); + void (*gp_slow_unregister)(atomic_t *rgssp); + bool (*reader_blocked)(void); + unsigned long long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); + void (*set_gpwrap_lag)(unsigned long lag); + int (*get_gpwrap_count)(int cpu); long cbflood_max; int irq_capable; int can_boost; int extendables; int slow_gps; int no_pi_lock; + int debug_objects; + int start_poll_irqsoff; + int have_up_down; const char *name; }; @@ -399,7 +449,7 @@ static int torture_readlock_not_held(void) return rcu_read_lock_bh_held() || rcu_read_lock_sched_held(); } -static int rcu_torture_read_lock(void) __acquires(RCU) +static int rcu_torture_read_lock(void) { rcu_read_lock(); return 0; @@ -422,7 +472,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); - if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) + if ((preempt_count() & HARDIRQ_MASK) || softirq_count()) longdelay_ms = 5; /* Avoid triggering BH limits. */ mdelay(longdelay_ms); rtrsp->rt_delay_ms = longdelay_ms; @@ -435,17 +485,24 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) rtrsp->rt_delay_us = shortdelay_us; } if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 500))) { + !(torture_random(rrsp) % (nrealreaders * 500))) torture_preempt_schedule(); /* QS only if preemptible. */ - rtrsp->rt_preempted = true; - } } -static void rcu_torture_read_unlock(int idx) __releases(RCU) +static void rcu_torture_read_unlock(int idx) { rcu_read_unlock(); } +static int rcu_torture_readlock_nesting(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + return rcu_preempt_depth(); + if (IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return (preempt_count() & PREEMPT_MASK); + return -1; +} + /* * Update callback in the pipe. This should be invoked after a grace period. */ @@ -459,12 +516,13 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) WRITE_ONCE(rp->rtort_chkp, NULL); smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire(). } - i = READ_ONCE(rp->rtort_pipe_count); + i = rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(rp->rtort_pipe_count, i + 1); - if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); + if (i + 1 >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; } @@ -534,6 +592,7 @@ static struct rcu_torture_ops rcu_ops = { .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .readlock_held = torture_readlock_not_held, + .readlock_nesting = rcu_torture_readlock_nesting, .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .deferred_free = rcu_torture_deferred_free, @@ -545,8 +604,6 @@ static struct rcu_torture_ops rcu_ops = { .get_comp_state_full = get_completed_synchronize_rcu_full, .get_gp_state = get_state_synchronize_rcu, .get_gp_state_full = get_state_synchronize_rcu_full, - .get_gp_completed = get_completed_synchronize_rcu, - .get_gp_completed_full = get_completed_synchronize_rcu_full, .start_gp_poll = start_poll_synchronize_rcu, .start_gp_poll_full = start_poll_synchronize_rcu_full, .poll_gp_state = poll_state_synchronize_rcu, @@ -554,21 +611,35 @@ static struct rcu_torture_ops rcu_ops = { .poll_need_2gp = rcu_poll_need_2gp, .cond_sync = cond_synchronize_rcu, .cond_sync_full = cond_synchronize_rcu_full, + .poll_active = NUM_ACTIVE_RCU_POLL_OLDSTATE, + .poll_active_full = NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE, .get_gp_state_exp = get_state_synchronize_rcu, .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, .poll_gp_state_exp = poll_state_synchronize_rcu, .cond_sync_exp = cond_synchronize_rcu_expedited, + .cond_sync_exp_full = cond_synchronize_rcu_expedited_full, .call = call_rcu_hurry, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, - .stats = NULL, .gp_kthread_dbg = show_rcu_gp_kthreads, .check_boost_failed = rcu_check_boost_fail, .stall_dur = rcu_jiffies_till_stall_check, + .get_gp_data = rcutorture_get_gp_data, + .gp_slow_register = rcu_gp_slow_register, + .gp_slow_unregister = rcu_gp_slow_unregister, + .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) + ? has_rcu_reader_blocked + : NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, + .set_gpwrap_lag = rcu_set_gpwrap_lag, + .get_gpwrap_count = rcu_get_gpwrap_count, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, + .debug_objects = 1, + .start_poll_irqsoff = 1, .name = "rcu" }; @@ -609,10 +680,10 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, - .cb_barrier = NULL, - .fqs = NULL, - .stats = NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" }; @@ -621,16 +692,65 @@ static struct rcu_torture_ops rcu_busted_ops = { */ DEFINE_STATIC_SRCU(srcu_ctl); +DEFINE_STATIC_SRCU_FAST(srcu_ctlf); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_ctlfud); static struct srcu_struct srcu_ctld; static struct srcu_struct *srcu_ctlp = &srcu_ctl; static struct rcu_torture_ops srcud_ops; -static int srcu_torture_read_lock(void) __acquires(srcu_ctlp) +static void srcu_torture_init(void) { - if (cur_ops == &srcud_ops) - return srcu_read_lock_nmisafe(srcu_ctlp); - else - return srcu_read_lock(srcu_ctlp); + rcu_sync_torture_init(); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) + VERBOSE_TOROUT_STRING("srcu_torture_init normal SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_NMI) + VERBOSE_TOROUT_STRING("srcu_torture_init NMI-safe SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + srcu_ctlp = &srcu_ctlf; + VERBOSE_TOROUT_STRING("srcu_torture_init fast SRCU"); + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + srcu_ctlp = &srcu_ctlfud; + VERBOSE_TOROUT_STRING("srcu_torture_init fast-up/down SRCU"); + } +} + +static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) +{ + srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq); +} + +static int srcu_torture_read_lock(void) +{ + int idx; + struct srcu_ctr __percpu *scp; + int ret = 0; + + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { + idx = srcu_read_lock(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx; + } + if (reader_flavor & SRCU_READ_FLAVOR_NMI) { + idx = srcu_read_lock_nmisafe(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 1; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 2; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 3; + } + return ret; } static void @@ -652,12 +772,18 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) } } -static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) +static void srcu_torture_read_unlock(int idx) { - if (cur_ops == &srcud_ops) - srcu_read_unlock_nmisafe(srcu_ctlp, idx); - else - srcu_read_unlock(srcu_ctlp, idx); + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + srcu_read_unlock_fast_updown(srcu_ctlp, + __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2)); + if (reader_flavor & SRCU_READ_FLAVOR_NMI) + srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) + srcu_read_unlock(srcu_ctlp, idx & 0x1); } static int torture_srcu_read_lock_held(void) @@ -665,6 +791,50 @@ static int torture_srcu_read_lock_held(void) return srcu_read_lock_held(srcu_ctlp); } +static bool srcu_torture_have_up_down(void) +{ + int rf = reader_flavor; + + if (!rf) + rf = SRCU_READ_FLAVOR_NORMAL; + return !!(cur_ops->have_up_down & rf); +} + +static int srcu_torture_down_read(void) +{ + int idx; + struct srcu_ctr __percpu *scp; + + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + WARN_ON_ONCE(reader_flavor & (reader_flavor - 1)); + + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { + idx = srcu_down_read(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + return idx; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + scp = srcu_down_read_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + return idx << 3; + } + WARN_ON_ONCE(1); + return 0; +} + +static void srcu_torture_up_read(int idx) +{ + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || + !(reader_flavor & SRCU_READ_FLAVOR_ALL)) + srcu_up_read(srcu_ctlp, idx & 0x1); + else + WARN_ON_ONCE(1); +} + static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(srcu_ctlp); @@ -716,33 +886,63 @@ static void srcu_torture_synchronize_expedited(void) synchronize_srcu_expedited(srcu_ctlp); } +static void srcu_torture_expedite_current(void) +{ + srcu_expedite_current(srcu_ctlp); +} + static struct rcu_torture_ops srcu_ops = { .ttype = SRCU_FLAVOR, - .init = rcu_sync_torture_init, + .init = srcu_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .get_gp_data = srcu_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcu" }; -static void srcu_torture_init(void) +static void srcud_torture_init(void) { rcu_sync_torture_init(); - WARN_ON(init_srcu_struct(&srcu_ctld)); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init normal SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_NMI) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init NMI-safe SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + WARN_ON(init_srcu_struct_fast(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + WARN_ON(init_srcu_struct_fast_updown(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast-up/down SRCU"); + } else { + WARN_ON(init_srcu_struct(&srcu_ctld)); + } srcu_ctlp = &srcu_ctld; } @@ -755,25 +955,36 @@ static void srcu_torture_cleanup(void) /* As above, but dynamically allocated. */ static struct rcu_torture_ops srcud_ops = { .ttype = SRCU_FLAVOR, - .init = srcu_torture_init, + .init = srcud_torture_init, .cleanup = srcu_torture_cleanup, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .get_gp_data = srcu_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcud" }; @@ -801,7 +1012,8 @@ static struct rcu_torture_ops busted_srcud_ops = { /* * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. - * This implementation does not necessarily work well with CPU hotplug. + * This implementation does not work well with CPU hotplug nor + * with rcutorture's shuffling. */ static void synchronize_rcu_trivial(void) @@ -809,25 +1021,35 @@ static void synchronize_rcu_trivial(void) int cpu; for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + torture_sched_setaffinity(current->pid, cpumask_of(cpu), true); WARN_ON_ONCE(raw_smp_processor_id() != cpu); } } -static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +static void rcu_sync_torture_init_trivial(void) +{ + rcu_sync_torture_init(); + // if (onoff_interval || shuffle_interval) { + if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) { + onoff_interval = 0; + shuffle_interval = 0; + } +} + +static int rcu_torture_read_lock_trivial(void) { preempt_disable(); return 0; } -static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +static void rcu_torture_read_unlock_trivial(int idx) { preempt_enable(); } static struct rcu_torture_ops trivial_ops = { .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, + .init = rcu_sync_torture_init_trivial, .readlock = rcu_torture_read_lock_trivial, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock_trivial, @@ -835,8 +1057,6 @@ static struct rcu_torture_ops trivial_ops = { .get_gp_seq = rcu_no_completed, .sync = synchronize_rcu_trivial, .exp_sync = synchronize_rcu_trivial, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .name = "trivial" }; @@ -879,8 +1099,7 @@ static struct rcu_torture_ops tasks_ops = { .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread, - .fqs = NULL, - .stats = NULL, + .get_gp_data = rcu_tasks_get_gp_data, .irq_capable = 1, .slow_gps = 1, .name = "tasks" @@ -901,11 +1120,6 @@ static struct rcu_torture_ops tasks_ops = { * Definitions for rude RCU-tasks torture testing. */ -static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); -} - static struct rcu_torture_ops tasks_rude_ops = { .ttype = RCU_TASKS_RUDE_FLAVOR, .init = rcu_sync_torture_init, @@ -913,15 +1127,11 @@ static struct rcu_torture_ops tasks_rude_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock_trivial, .get_gp_seq = rcu_no_completed, - .deferred_free = rcu_tasks_rude_torture_deferred_free, .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, - .call = call_rcu_tasks_rude, - .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, + .get_gp_data = rcu_tasks_rude_get_gp_data, .cbflood_max = 50000, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .name = "tasks-rude" }; @@ -971,9 +1181,8 @@ static struct rcu_torture_ops tasks_tracing_ops = { .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, + .get_gp_data = rcu_tasks_trace_get_gp_data, .cbflood_max = 50000, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .slow_gps = 1, .name = "tasks-tracing" @@ -1048,8 +1257,13 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *star // At most one persisted message per boost test. j = jiffies; lp = READ_ONCE(last_persist); - if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp) - pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + if (time_after(j, lp + mininterval) && + cmpxchg(&last_persist, lp, j) == lp) { + if (cpu < 0) + pr_info("Boost inversion persisted: QS from all CPUs\n"); + else + pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + } return false; // passed on a technicality } VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); @@ -1079,8 +1293,19 @@ static int rcu_torture_boost(void *arg) unsigned long gp_state; unsigned long gp_state_time; unsigned long oldstarttime; + unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ; - VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) { + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + } else { + VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period"); + while (time_before(jiffies, booststarttime)) { + schedule_timeout_idle(HZ); + if (kthread_should_stop()) + goto cleanup; + } + VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period"); + } /* Set real-time priority. */ sched_set_fifo_low(current); @@ -1148,7 +1373,7 @@ static int rcu_torture_boost(void *arg) mutex_unlock(&boost_mutex); break; } - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } /* Go do the stutter. */ @@ -1156,10 +1381,11 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); +cleanup: /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } torture_kthread_stopping("rcu_torture_boost"); return 0; @@ -1182,7 +1408,7 @@ rcu_torture_fqs(void *arg) fqs_resume_time = jiffies + fqs_stutter * HZ; while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(HZ / 20); } fqs_burst_remaining = fqs_duration; while (fqs_burst_remaining > 0 && @@ -1306,6 +1532,8 @@ static void rcu_torture_write_types(void) } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); } + pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes); + pr_info("%s: gp_cond_wi %d gp_cond_wi_exp %d gp_poll_wi %d gp_poll_wi_exp %d\n", __func__, gp_cond_wi, gp_cond_wi_exp, gp_poll_wi, gp_poll_wi_exp); } /* @@ -1350,7 +1578,7 @@ static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void)) static int rcu_torture_writer(void *arg) { - bool boot_ended; + bool booting_still = false; bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); unsigned long cookie; struct rcu_gp_oldstate cookie_full; @@ -1361,14 +1589,22 @@ rcu_torture_writer(void *arg) struct rcu_gp_oldstate gp_snap1_full; int i; int idx; + unsigned long j; int oldnice = task_nice(current); - struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE]; + struct rcu_gp_oldstate *rgo = NULL; + int rgo_size = 0; struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); + unsigned long stallsdone = jiffies; bool stutter_waited; - unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + unsigned long *ulo = NULL; + int ulo_size = 0; + // If a new stall test is added, this must be adjusted. + if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) + stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * + HZ * (stall_cpu_repeat + 1); VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) pr_alert("%s" TORTURE_FLAG @@ -1385,6 +1621,26 @@ rcu_torture_writer(void *arg) torture_kthread_stopping("rcu_torture_writer"); return 0; } + if (cur_ops->poll_active > 0) { + ulo = kcalloc(cur_ops->poll_active, sizeof(*ulo), GFP_KERNEL); + if (!WARN_ON(!ulo)) + ulo_size = cur_ops->poll_active; + } + if (cur_ops->poll_active_full > 0) { + rgo = kcalloc(cur_ops->poll_active_full, sizeof(*rgo), GFP_KERNEL); + if (!WARN_ON(!rgo)) + rgo_size = cur_ops->poll_active_full; + } + + // If the system is still booting, let it finish. + j = jiffies; + while (!torture_must_stop() && !rcu_inkernel_boot_has_ended()) { + booting_still = true; + schedule_timeout_interruptible(HZ); + } + if (booting_still) + pr_alert("%s" TORTURE_FLAG " Waited %lu jiffies for boot to complete.\n", + torture_type, jiffies - j); do { rcu_torture_writer_state = RTWS_FIXED_DELAY; @@ -1393,6 +1649,7 @@ rcu_torture_writer(void *arg) if (rp == NULL) continue; rp->rtort_pipe_count = 0; + ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); rcu_torture_writer_state = RTWS_DELAY; udelay(torture_random(&rand) & 0x3ff); rcu_torture_writer_state = RTWS_REPLACE; @@ -1408,6 +1665,7 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(old_rp->rtort_pipe_count, old_rp->rtort_pipe_count + 1); + ASSERT_EXCLUSIVE_WRITER(old_rp->rtort_pipe_count); // Make sure readers block polled grace periods. if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { @@ -1419,8 +1677,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); - if (cur_ops->get_gp_completed) { - cookie = cur_ops->get_gp_completed(); + if (cur_ops->get_comp_state) { + cookie = cur_ops->get_comp_state(); WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); } cur_ops->readunlock(idx); @@ -1434,8 +1692,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cpumask_pr_args(cpu_online_mask)); - if (cur_ops->get_gp_completed_full) { - cur_ops->get_gp_completed_full(&cookie_full); + if (cur_ops->get_comp_state_full) { + cur_ops->get_comp_state_full(&cookie_full); WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); } cur_ops->readunlock(idx); @@ -1453,7 +1711,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; gp_snap = cur_ops->get_gp_state(); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC; cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1461,7 +1720,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_EXP: rcu_torture_writer_state = RTWS_COND_GET_EXP; gp_snap = cur_ops->get_gp_state_exp(); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_EXP; cur_ops->cond_sync_exp(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1469,7 +1729,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_FULL: rcu_torture_writer_state = RTWS_COND_GET_FULL; cur_ops->get_gp_state_full(&gp_snap_full); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_FULL; cur_ops->cond_sync_full(&gp_snap_full); rcu_torture_pipe_update(old_rp); @@ -1477,49 +1738,54 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_EXP_FULL: rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL; cur_ops->get_gp_state_full(&gp_snap_full); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL; cur_ops->cond_sync_exp_full(&gp_snap_full); rcu_torture_pipe_update(old_rp); break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; + if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + cur_ops->exp_current(); while (!cur_ops->poll_gp_state(gp_snap)) { gp_snap1 = cur_ops->get_gp_state(); - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) if (cur_ops->poll_gp_state(ulo[i]) || cur_ops->same_gp_state(ulo[i], gp_snap1)) { ulo[i] = gp_snap1; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(ulo)); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); } rcu_torture_pipe_update(old_rp); break; case RTWS_POLL_GET_FULL: rcu_torture_writer_state = RTWS_POLL_GET_FULL; - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) cur_ops->get_comp_state_full(&rgo[i]); cur_ops->start_gp_poll_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; + if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + cur_ops->exp_current(); while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { cur_ops->get_gp_state_full(&gp_snap1_full); - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) if (cur_ops->poll_gp_state_full(&rgo[i]) || cur_ops->same_gp_state_full(&rgo[i], &gp_snap1_full)) { rgo[i] = gp_snap1_full; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(rgo)); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); } rcu_torture_pipe_update(old_rp); break; @@ -1528,8 +1794,8 @@ rcu_torture_writer(void *arg) gp_snap = cur_ops->start_gp_poll_exp(); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; while (!cur_ops->poll_gp_state_exp(gp_snap)) - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_POLL_GET_EXP_FULL: @@ -1537,8 +1803,8 @@ rcu_torture_writer(void *arg) cur_ops->start_gp_poll_exp_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; while (!cur_ops->poll_gp_state_full(&gp_snap_full)) - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_SYNC: @@ -1568,20 +1834,21 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - boot_ended = rcu_inkernel_boot_has_ended(); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && !atomic_read(&rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && - boot_ended) + time_after(jiffies, stallsdone)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && - rcu_access_pointer(rcu_torture_current) != - &rcu_tortures[i]) { + rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { tracing_off(); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); + break; } if (stutter_waited) sched_set_normal(current, oldnice); @@ -1597,6 +1864,8 @@ rcu_torture_writer(void *arg) pr_alert("%s" TORTURE_FLAG " Dynamic grace-period expediting was disabled.\n", torture_type); + kfree(ulo); + kfree(rgo); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; @@ -1630,7 +1899,7 @@ rcu_torture_fakewriter(void *arg) do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && - torture_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nrealfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else { switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -1660,14 +1929,22 @@ rcu_torture_fakewriter(void *arg) cur_ops->cond_sync_exp_full(&gp_snap_full); break; case RTWS_POLL_GET: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); gp_snap = cur_ops->start_gp_poll(); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); while (!cur_ops->poll_gp_state(gp_snap)) { torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } break; case RTWS_POLL_GET_FULL: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); cur_ops->start_gp_poll_full(&gp_snap_full); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); @@ -1763,6 +2040,65 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign. } +// Verify the specified RCUTORTURE_RDR* state. +#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() +static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) +{ + int mask; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi()) + return; + + WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS); + WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS); + + // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable. + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + !softirq_count(), ROEC_ARGS); + WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + !(preempt_count() & PREEMPT_MASK), ROEC_ARGS); + WARN_ONCE(cur_ops->readlock_nesting && + (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + cur_ops->readlock_nesting() == 0, ROEC_ARGS); + + // Interrupt handlers have all sorts of stuff disabled, so ignore + // unintended disabling. + if (in_serving_softirq() || in_hardirq()) + return; + + WARN_ONCE(cur_ops->extendables && + !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + softirq_count(), ROEC_ARGS); + + /* + * non-preemptible RCU in a preemptible kernel uses preempt_disable() + * as rcu_read_lock(). + */ + mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + + WARN_ONCE(cur_ops->extendables && !(curstate & mask) && + (preempt_count() & PREEMPT_MASK), ROEC_ARGS); + + /* + * non-preemptible RCU in a preemptible kernel uses "preempt_count() & + * PREEMPT_MASK" as ->readlock_nesting(). + */ + mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count()) + mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; + + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && + cur_ops->readlock_nesting() > 0, ROEC_ARGS); +} + /* * Do one extension of an RCU read-side critical section using the * current reader state in readstate (set to zero for initial entry @@ -1772,10 +2108,10 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, * beginning or end of the critical section and if there was actually a * change, do a ->read_delay(). */ -static void rcutorture_one_extend(int *readstate, int newstate, - struct torture_random_state *trsp, +static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + bool first; unsigned long flags; int idxnew1 = -1; int idxnew2 = -1; @@ -1784,8 +2120,10 @@ static void rcutorture_one_extend(int *readstate, int newstate, int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate; + first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); - WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1); + WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN)); + rcutorture_one_extend_check("before change", idxold1, statesnew, statesold); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -1800,9 +2138,30 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); if (statesnew & RCUTORTURE_RDR_RCU_1) - idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; if (statesnew & RCUTORTURE_RDR_RCU_2) - idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2; + idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; + + // Complain unless both the old and the new protection is in place. + rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold); + + // Sample CPU under both sets of protections to reduce confusion. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + int cpu = raw_smp_processor_id(); + rtrsp->rt_cpu = cpu; + if (!first) { + rtrsp[-1].rt_end_cpu = cpu; + if (cur_ops->reader_blocked) + rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); + } + } + // Sample grace-period sequence number, as good a place as any. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { + rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + rtrsp->rt_ts = ktime_get_mono_fast_ns(); + if (!first) + rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; + } /* * Next, remove old protection, in decreasing order of strength @@ -1822,7 +2181,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_RCU_2) { - cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2); WARN_ON_ONCE(idxnew2 != -1); idxold2 = 0; } @@ -1832,12 +2191,17 @@ static void rcutorture_one_extend(int *readstate, int newstate, lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); - cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); WARN_ON_ONCE(idxnew1 != -1); idxold1 = 0; if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } + if (statesold & RCUTORTURE_RDR_UPDOWN) { + cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; + } /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) @@ -1847,16 +2211,14 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (idxnew1 == -1) idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; WARN_ON_ONCE(idxnew1 < 0); - if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) - pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); if (idxnew2 == -1) idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; WARN_ON_ONCE(idxnew2 < 0); - WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); *readstate = idxnew1 | idxnew2 | newstate; WARN_ON_ONCE(*readstate < 0); - if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) - pr_info("Unexpected idxnew2 value of %#x\n", idxnew2); + if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) + pr_info("Unexpected readstate value of %#x\n", *readstate); + rcutorture_one_extend_check("after change", *readstate, statesnew, statesold); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1875,13 +2237,13 @@ static int rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) { int mask = rcutorture_extend_mask_max(); - unsigned long randmask1 = torture_random(trsp) >> 8; + unsigned long randmask1 = torture_random(trsp); unsigned long randmask2 = randmask1 >> 3; unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); // Can't have reader idx bits. /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; @@ -1923,8 +2285,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * critical section. */ static struct rt_read_seg * -rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, - struct rt_read_seg *rtrsp) +rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { int i; int j; @@ -1934,109 +2295,153 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, if (!((mask - 1) & mask)) return rtrsp; /* Current RCU reader not extendable. */ /* Bias towards larger numbers of loops. */ - i = (torture_random(trsp) >> 3); + i = torture_random(trsp); i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); + WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN); rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); } return &rtrsp[j]; } -/* - * Do one read-side critical section, returning false if there was - * no data to read. Can be invoked both from process context and - * from a timer handler. - */ -static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) -{ - bool checkpolling = !(torture_random(trsp) & 0xfff); +struct rcu_torture_one_read_state { + bool checkpolling; unsigned long cookie; struct rcu_gp_oldstate cookie_full; - int i; unsigned long started; - unsigned long completed; - int newstate; struct rcu_torture *p; - int pipe_count; - int readstate = 0; - struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; - struct rt_read_seg *rtrsp = &rtseg[0]; - struct rt_read_seg *rtrsp1; + int readstate; + struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS]; + struct rt_read_seg *rtrsp; unsigned long long ts; +}; - WARN_ON_ONCE(!rcu_is_watching()); - newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); - if (checkpolling) { +static void init_rcu_torture_one_read_state(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp) +{ + memset(rtorsp, 0, sizeof(*rtorsp)); + rtorsp->checkpolling = !(torture_random(trsp) & 0xfff); + rtorsp->rtrsp = &rtorsp->rtseg[0]; +} + +/* + * Set up the first segment of a series of overlapping read-side + * critical sections. The caller must have actually initiated the + * outermost read-side critical section. + */ +static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp, long myid) +{ + if (rtorsp->checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - cookie = cur_ops->get_gp_state(); + rtorsp->cookie = cur_ops->get_gp_state(); if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) - cur_ops->get_gp_state_full(&cookie_full); + cur_ops->get_gp_state_full(&rtorsp->cookie_full); } - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - !cur_ops->readlock_held || cur_ops->readlock_held()); - if (p == NULL) { + rtorsp->started = cur_ops->get_gp_seq(); + rtorsp->ts = rcu_trace_clock_local(); + rtorsp->p = rcu_dereference_check(rcu_torture_current, + !cur_ops->readlock_held || cur_ops->readlock_held() || + (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN)); + if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); return false; } - if (p->rtort_mbtest == 0) + if (rtorsp->p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - rcu_torture_reader_do_mbchk(myid, p, trsp); - rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); + rcu_torture_reader_do_mbchk(myid, rtorsp->p, trsp); + return true; +} + +/* + * Complete the last segment of a series of overlapping read-side + * critical sections and check for errors. + */ +static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp) +{ + int i; + unsigned long completed; + int pipe_count; + bool preempted = false; + struct rt_read_seg *rtrsp1; + preempt_disable(); - pipe_count = READ_ONCE(p->rtort_pipe_count); + pipe_count = READ_ONCE(rtorsp->p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ + // Should not happen in a correct RCU implementation, + // happens quite often for torture_type=busted. pipe_count = RCU_TORTURE_PIPE_LEN; } completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, started, completed); + do_trace_rcu_torture_read(cur_ops->name, &rtorsp->p->rtort_rcu, + rtorsp->ts, rtorsp->started, completed); rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); + completed = rcutorture_seq_diff(completed, rtorsp->started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - if (checkpolling) { + if (rtorsp->checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ONCE(cur_ops->poll_gp_state(cookie), + WARN_ONCE(cur_ops->poll_gp_state(rtorsp->cookie), "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", __func__, rcu_torture_writer_state_getname(), rcu_torture_writer_state, - cookie, cur_ops->get_gp_state()); + rtorsp->cookie, cur_ops->get_gp_state()); if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) - WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + WARN_ONCE(cur_ops->poll_gp_state_full(&rtorsp->cookie_full), "%s: Cookie check 6 failed %s(%d) online %*pbl\n", __func__, rcu_torture_writer_state_getname(), rcu_torture_writer_state, cpumask_pr_args(cpu_online_mask)); } - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); - WARN_ON_ONCE(readstate); + if (cur_ops->reader_blocked) + preempted = cur_ops->reader_blocked(); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); + WARN_ON_ONCE(rtorsp->readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. - WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); + WARN_ON_ONCE(leakpointer && READ_ONCE(rtorsp->p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { i = 0; - for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) + for (rtrsp1 = &rtorsp->rtseg[0]; rtrsp1 < rtorsp->rtrsp; rtrsp1++) err_segs[i++] = *rtrsp1; rt_read_nsegs = i; + rt_read_preempted = preempted; } +} + +/* + * Do one read-side critical section, returning false if there was + * no data to read. Can be invoked both from process context and + * from a timer handler. + */ +static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) +{ + int newstate; + struct rcu_torture_one_read_state rtors; + WARN_ON_ONCE(!rcu_is_watching()); + init_rcu_torture_one_read_state(&rtors, trsp); + newstate = rcutorture_extend_mask(rtors.readstate, trsp); + WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); + rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++); + if (!rcu_torture_one_read_start(&rtors, trsp, myid)) + return false; + rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp); + rcu_torture_one_read_end(&rtors, trsp); return true; } @@ -2081,7 +2486,7 @@ rcu_torture_reader(void *arg) set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) timer_setup_on_stack(&t, rcu_torture_timer, 0); - tick_dep_set_task(current, TICK_DEP_BIT_RCU); + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. do { if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) @@ -2093,19 +2498,166 @@ rcu_torture_reader(void *arg) torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } - while (torture_num_online_cpus() < mynumonline && !torture_must_stop()) + while (!torture_must_stop() && + (torture_num_online_cpus() < mynumonline || !rcu_inkernel_boot_has_ended())) schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { - del_timer_sync(&t); - destroy_timer_on_stack(&t); + timer_delete_sync(&t); + timer_destroy_on_stack(&t); } tick_dep_clear_task(current, TICK_DEP_BIT_RCU); torture_kthread_stopping("rcu_torture_reader"); return 0; } +struct rcu_torture_one_read_state_updown { + struct hrtimer rtorsu_hrt; + bool rtorsu_inuse; + ktime_t rtorsu_kt; + int rtorsu_cpu; + unsigned long rtorsu_j; + unsigned long rtorsu_ndowns; + unsigned long rtorsu_nups; + unsigned long rtorsu_nmigrates; + struct torture_random_state rtorsu_trs; + struct rcu_torture_one_read_state rtorsu_rtors; +}; + +static struct rcu_torture_one_read_state_updown *updownreaders; +static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand); +static int rcu_torture_updown(void *arg); + +static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) +{ + int cpu = raw_smp_processor_id(); + struct rcu_torture_one_read_state_updown *rtorsup; + + rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + WRITE_ONCE(rtorsup->rtorsu_nmigrates, + rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu)); + smp_store_release(&rtorsup->rtorsu_inuse, false); + return HRTIMER_NORESTART; +} + +static int rcu_torture_updown_init(void) +{ + int i; + struct torture_random_state *rand = &rcu_torture_updown_rand; + int ret; + + if (n_up_down < 0) + return 0; + if (!srcu_torture_have_up_down()) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives"); + return 0; + } + updownreaders = kcalloc(n_up_down, sizeof(*updownreaders), GFP_KERNEL); + if (!updownreaders) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests"); + return -ENOMEM; + } + for (i = 0; i < n_up_down; i++) { + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand); + hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC, + HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + torture_random_init(&updownreaders[i].rtorsu_trs); + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, + &updownreaders[i].rtorsu_trs); + } + ret = torture_create_kthread(rcu_torture_updown, rand, updown_task); + if (ret) { + kfree(updownreaders); + updownreaders = NULL; + } + return ret; +} + +static void rcu_torture_updown_cleanup(void) +{ + struct rcu_torture_one_read_state_updown *rtorsup; + + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) + continue; + if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + smp_store_release(&rtorsup->rtorsu_inuse, false); + } + + } + kfree(updownreaders); + updownreaders = NULL; +} + +// Do one reader for rcu_torture_updown(). +static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup) +{ + int idx; + int rawidx; + ktime_t t; + + init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + rawidx = cur_ops->down_read(); + WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1); + idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; + rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; + rtorsup->rtorsu_rtors.rtrsp++; + rtorsup->rtorsu_cpu = raw_smp_processor_id(); + if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + schedule_timeout_idle(HZ); + return; + } + smp_store_release(&rtorsup->rtorsu_inuse, true); + t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. + if (t < 10 * 1000) + t = 200 * 1000 * 1000; + hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + smp_mb(); // Sample jiffies after posting hrtimer. + rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler. + rtorsup->rtorsu_kt = t; +} + +/* + * RCU torture up/down reader kthread, starting RCU readers in kthread + * context and ending them in hrtimer handlers. Otherwise similar to + * rcu_torture_reader(). + */ +static int +rcu_torture_updown(void *arg) +{ + unsigned long j; + struct rcu_torture_one_read_state_updown *rtorsup; + + VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); + do { + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (torture_must_stop()) + break; + j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse. + if (smp_load_acquire(&rtorsup->rtorsu_inuse)) { + WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10), + "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j); + continue; + } + rcu_torture_updown_one(rtorsup); + } + torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); + stutter_wait("rcu_torture_updown"); + } while (!torture_must_stop()); + rcu_torture_updown_cleanup(); + torture_kthread_stopping("rcu_torture_updown"); + return 0; +} + /* * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to * increase race probabilities and fuzzes the interval between toggling. @@ -2124,7 +2676,7 @@ static int rcu_nocb_toggle(void *arg) VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started"); while (!rcu_inkernel_boot_has_ended()) schedule_timeout_interruptible(HZ / 10); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); if (toggle_interval > ULONG_MAX) @@ -2135,7 +2687,7 @@ static int rcu_nocb_toggle(void *arg) toggle_fuzz = NSEC_PER_USEC; do { r = torture_random(&rand); - cpu = (r >> 4) % (maxcpu + 1); + cpu = (r >> 1) % (maxcpu + 1); if (r & 0x1) { rcu_nocb_cpu_offload(cpu); atomic_long_inc(&n_nocb_offload); @@ -2168,6 +2720,11 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long n_gpwraps = 0; + unsigned long ndowns = 0; + unsigned long nunexpired = 0; + unsigned long nmigrates = 0; + unsigned long nups = 0; struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; @@ -2178,11 +2735,21 @@ rcu_torture_stats_print(void) pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } + if (cur_ops->get_gpwrap_count) + n_gpwraps += cur_ops->get_gpwrap_count(cpu); + } + if (updownreaders) { + for (i = 0; i < n_up_down; i++) { + ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns); + nups += READ_ONCE(updownreaders[i].rtorsu_nups); + nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse); + nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates); + } } for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; - } + } // The value of variable "i" is used later, so don't clobber it! pr_alert("%s%s ", torture_type, TORTURE_FLAG); rtcp = rcu_access_pointer(rcu_torture_current); @@ -2194,38 +2761,38 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ", atomic_read(&n_rcu_torture_mberror), atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries), n_rcu_torture_barrier_error, - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); + n_rcu_torture_boost_ktrerror); pr_cont("rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); + if (updownreaders) + pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic. - pr_cont("nocb-toggles: %ld:%ld\n", + pr_cont("nocb-toggles: %ld:%ld ", atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); + pr_cont("gpwraps: %ld\n", n_gpwraps); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || atomic_read(&n_rcu_torture_mbchk_fail) || n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || - n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure || - i > 1) { + n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail)); WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread - WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?) WARN_ON_ONCE(i > 1); // Too-short grace period } @@ -2251,14 +2818,13 @@ rcu_torture_stats_print(void) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_access_pointer(rcu_torture_current) && - !rcu_stall_is_suppressed()) { + !rcu_stall_is_suppressed() && + rcu_inkernel_boot_has_ended()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; - rcutorture_get_gp_data(cur_ops->ttype, - &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); wtp = READ_ONCE(writer_task); pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n", rcu_torture_writer_state_getname(), @@ -2352,24 +2918,30 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " + "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " - "stall_cpu_block=%d " + "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " - "nocbs_nthreads=%d nocbs_toggle=%d\n", - torture_type, tag, nrealreaders, nfakewriters, + "reader_flavor=%x " + "nocbs_nthreads=%d nocbs_toggle=%d " + "test_nmis=%d " + "preempt_duration=%d preempt_interval=%d n_up_down=%d\n", + torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, + test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, - stall_cpu_block, + stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, - nocbs_nthreads, nocbs_toggle); + reader_flavor, + nocbs_nthreads, nocbs_toggle, + test_nmis, + preempt_duration, preempt_interval, n_up_down); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -2407,6 +2979,14 @@ static int rcutorture_booster_init(unsigned int cpu) WARN_ON_ONCE(!t); sp.sched_priority = 2; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); +#ifdef CONFIG_IRQ_FORCED_THREADING + if (force_irqthreads()) { + t = per_cpu(ktimerd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } +#endif } /* Don't allow time recalculation while creating a new task. */ @@ -2427,16 +3007,26 @@ static int rcutorture_booster_init(unsigned int cpu) return 0; } +static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr) +{ + pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr); + return NOTIFY_OK; +} + +static struct notifier_block rcu_torture_stall_block = { + .notifier_call = rcu_torture_stall_nf, +}; + /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then - * induces a CPU stall for the time specified by stall_cpu. + * induces a CPU stall for the time specified by stall_cpu. If a new + * stall test is added, stallsdone in rcu_torture_writer() must be adjusted. */ -static int rcu_torture_stall(void *args) +static void rcu_torture_stall_one(int rep, int irqsoff) { int idx; unsigned long stop_at; - VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2456,14 +3046,14 @@ static int rcu_torture_stall(void *args) stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ idx = cur_ops->readlock(); - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_disable(); else if (!stall_cpu_block) preempt_disable(); - pr_alert("%s start on CPU %d.\n", - __func__, raw_smp_processor_id()); - while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), - stop_at)) + pr_alert("%s start stall episode %d on CPU %d.\n", + __func__, rep + 1, raw_smp_processor_id()); + while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) && + !kthread_should_stop()) if (stall_cpu_block) { #ifdef CONFIG_PREEMPTION preempt_schedule(); @@ -2473,13 +3063,48 @@ static int rcu_torture_stall(void *args) } else if (stall_no_softlockup) { touch_softlockup_watchdog(); } - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_enable(); else if (!stall_cpu_block) preempt_enable(); cur_ops->readunlock(idx); } +} + +/* + * CPU-stall kthread. Invokes rcu_torture_stall_one() once, and then as many + * additional times as specified by the stall_cpu_repeat module parameter. + * Note that stall_cpu_irqsoff is ignored on the second and subsequent + * stall. + */ +static int rcu_torture_stall(void *args) +{ + int i; + int repeat = stall_cpu_repeat; + int ret; + + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + if (repeat < 0) { + repeat = 0; + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + } + if (rcu_cpu_stall_notifiers) { + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); + } + for (i = 0; i <= repeat; i++) { + if (kthread_should_stop()) + break; + rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0); + } pr_alert("%s end.\n", __func__); + if (rcu_cpu_stall_notifiers && !ret) { + ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret); + } torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -2590,7 +3215,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); rfcpp = rfp->rcu_fwd_cb_tail; rfp->rcu_fwd_cb_tail = &rfcp->rfc_next; - WRITE_ONCE(*rfcpp, rfcp); + smp_store_release(rfcpp, rfcp); WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1); i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(rfp->n_launders_hist)) @@ -2639,7 +3264,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp) rcu_torture_fwd_prog_cond_resched(freed); if (tick_nohz_full_enabled()) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } } @@ -2756,7 +3381,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); rfp->rcu_launder_gp_seq_start = gps; - tick_dep_set_task(current, TICK_DEP_BIT_RCU); + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. while (time_before(jiffies, stopat) && !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { @@ -2789,7 +3414,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); if (tick_nohz_full_enabled()) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } } @@ -2803,13 +3428,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && !shutdown_time_arrived()) { - WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); - pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", + if (WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED) && cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); + pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n", __func__, stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, - n_max_gps, n_max_cbs, cver, gps); + n_max_gps, n_max_cbs, cver, gps, num_online_cpus()); atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); mutex_lock(&rcu_fwd_mutex); // Serialize histograms. rcu_torture_fwd_cb_hist(rfp); @@ -2883,6 +3509,8 @@ static int rcu_torture_fwd_prog(void *args) int tested_tries = 0; VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); rcu_bind_current_to_nocb(); if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); @@ -2898,7 +3526,7 @@ static int rcu_torture_fwd_prog(void *args) WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); } else { while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(HZ / 20); oldseq = READ_ONCE(rcu_fwd_seq); } pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); @@ -2945,12 +3573,12 @@ static int __init rcu_torture_fwd_prog_init(void) fwd_progress = 0; return 0; } - if (stall_cpu > 0) { - VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + if (stall_cpu > 0 || (preempt_duration > 0 && IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall and/or preemption testing"); fwd_progress = 0; if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ - WARN_ON(1); /* Make sure rcutorture notices conflict. */ + WARN_ON(1); /* Make sure rcutorture scripting notices conflict. */ return 0; } if (fwd_progress_holdoff <= 0) @@ -3011,11 +3639,12 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) } /* IPI handler to get callback posted on desired CPU, if online. */ -static void rcu_torture_barrier1cb(void *rcu_void) +static int rcu_torture_barrier1cb(void *rcu_void) { struct rcu_head *rhp = rcu_void; cur_ops->call(rhp, rcu_torture_barrier_cbf); + return 0; } /* kthread function to register callbacks used to test RCU barriers. */ @@ -3041,11 +3670,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the following ->call(). */ - if (smp_call_function_single(myid, rcu_torture_barrier1cb, - &rcu, 1)) { - // IPI failed, so use direct call from current CPU. + if (smp_call_on_cpu(myid, rcu_torture_barrier1cb, &rcu, 1)) cur_ops->call(&rcu, rcu_torture_barrier_cbf); - } + if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -3199,7 +3826,7 @@ static int rcu_torture_read_exit_child(void *trsp_in) set_user_nice(current, MAX_NICE); // Minimize time between reading and exiting. while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); (void)rcu_torture_one_read(trsp, -1); return 0; } @@ -3247,7 +3874,7 @@ static int rcu_torture_read_exit(void *unused) smp_mb(); // Store before wakeup. wake_up(&read_exit_wq); while (!torture_must_stop()) - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); torture_kthread_stopping("rcu_torture_read_exit"); return 0; } @@ -3273,8 +3900,111 @@ static void rcu_torture_read_exit_cleanup(void) torture_stop_kthread(rcutorture_read_exit, read_exit_task); } +static void rcutorture_test_nmis(int n) +{ +#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + int cpu; + int dumpcpu; + int i; + + for (i = 0; i < n; i++) { + preempt_disable(); + cpu = smp_processor_id(); + dumpcpu = cpu + 1; + if (dumpcpu >= nr_cpu_ids) + dumpcpu = 0; + pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu); + dump_cpu_task(dumpcpu); + preempt_enable(); + schedule_timeout_uninterruptible(15 * HZ); + } +#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis); +#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) +} + +// Randomly preempt online CPUs. +static int rcu_torture_preempt(void *unused) +{ + int cpu = -1; + DEFINE_TORTURE_RANDOM(rand); + + schedule_timeout_idle(stall_cpu_holdoff); + do { + // Wait for preempt_interval ms with up to 100us fuzz. + torture_hrtimeout_ms(preempt_interval, 100, &rand); + // Select online CPU. + cpu = cpumask_next(cpu, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_next(-1, cpu_online_mask); + WARN_ON_ONCE(cpu >= nr_cpu_ids); + // Move to that CPU, if can't do so, retry later. + if (torture_sched_setaffinity(current->pid, cpumask_of(cpu), false)) + continue; + // Preempt at high-ish priority, then reset to normal. + sched_set_fifo(current); + torture_sched_setaffinity(current->pid, cpu_present_mask, true); + mdelay(preempt_duration); + sched_set_normal(current, 0); + stutter_wait("rcu_torture_preempt"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_preempt"); + return 0; +} + static enum cpuhp_state rcutor_hp; +static struct hrtimer gpwrap_lag_timer; +static bool gpwrap_lag_active; + +/* Timer handler for toggling RCU grace-period sequence overflow test lag value */ +static enum hrtimer_restart rcu_gpwrap_lag_timer(struct hrtimer *timer) +{ + ktime_t next_delay; + + if (gpwrap_lag_active) { + pr_alert("rcu-torture: Disabling gpwrap lag (value=0)\n"); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; + next_delay = ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0); + } else { + pr_alert("rcu-torture: Enabling gpwrap lag (value=%d)\n", gpwrap_lag_gps); + cur_ops->set_gpwrap_lag(gpwrap_lag_gps); + gpwrap_lag_active = true; + next_delay = ktime_set(gpwrap_lag_active_mins * 60, 0); + } + + if (torture_must_stop_irq()) + return HRTIMER_NORESTART; + + hrtimer_forward_now(timer, next_delay); + return HRTIMER_RESTART; +} + +static int rcu_gpwrap_lag_init(void) +{ + if (!gpwrap_lag) + return 0; + + if (gpwrap_lag_cycle_mins <= 0 || gpwrap_lag_active_mins <= 0) { + pr_alert("rcu-torture: lag timing parameters must be positive\n"); + return -EINVAL; + } + + hrtimer_setup(&gpwrap_lag_timer, rcu_gpwrap_lag_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gpwrap_lag_active = false; + hrtimer_start(&gpwrap_lag_timer, + ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0), HRTIMER_MODE_REL); + + return 0; +} + +static void rcu_gpwrap_lag_cleanup(void) +{ + hrtimer_cancel(&gpwrap_lag_timer); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; +} static void rcu_torture_cleanup(void) { @@ -3282,23 +4012,27 @@ rcu_torture_cleanup(void) int flags = 0; unsigned long gp_seq = 0; int i; + int j; if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) { pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); } - rcu_gp_slow_unregister(NULL); + if (cur_ops->gp_slow_unregister) + cur_ops->gp_slow_unregister(NULL); return; } if (!cur_ops) { torture_cleanup_end(); - rcu_gp_slow_unregister(NULL); return; } + rcutorture_test_nmis(test_nmis); + if (cur_ops->gp_kthread_dbg) cur_ops->gp_kthread_dbg(); + torture_stop_kthread(rcu_torture_preempt, preempt_task); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); rcu_torture_fwd_prog_cleanup(); @@ -3312,6 +4046,10 @@ rcu_torture_cleanup(void) nocb_tasks = NULL; } + if (updown_task) { + torture_stop_kthread(rcu_torture_updown, updown_task); + updown_task = NULL; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) torture_stop_kthread(rcu_torture_reader, @@ -3323,15 +4061,15 @@ rcu_torture_cleanup(void) rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) + for (i = 0; i < nrealfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); kfree(fakewriter_tasks); fakewriter_tasks = NULL; } - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", cur_ops->name, (long)gp_seq, flags, rcutorture_seq_diff(gp_seq, start_gp_seq)); @@ -3361,26 +4099,74 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#x ", i, err_segs[i].rt_readstate); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP)) + pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL)); + else + pr_alert("\t"); + pr_cont("%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); firsttime = 0; } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + pr_cont(" CPU %2d", err_segs[i].rt_cpu); + if (err_segs[i].rt_cpu != err_segs[i].rt_end_cpu) + pr_cont("->%-2d", err_segs[i].rt_end_cpu); + else + pr_cont(" ..."); + } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && + cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { + char buf1[20+1]; + char buf2[20+1]; + char sepchar = '-'; + + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, + buf1, ARRAY_SIZE(buf1)); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, + buf2, ARRAY_SIZE(buf2)); + if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { + if (buf2[0]) { + for (j = 0; buf2[j]; j++) + buf2[j] = '.'; + if (j) + buf2[j - 1] = ' '; + } + sepchar = ' '; + } + pr_cont(" %s%c%s", buf1, sepchar, buf2); + } if (err_segs[i].rt_delay_ms != 0) { - pr_cont("%s%ldms", firsttime ? "" : "+", + pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); firsttime = 0; } if (err_segs[i].rt_delay_us != 0) { - pr_cont("%s%ldus", firsttime ? "" : "+", + pr_cont(" %s%ldus", firsttime ? "" : "+", err_segs[i].rt_delay_us); firsttime = 0; } - pr_cont("%s\n", - err_segs[i].rt_preempted ? "preempted" : ""); + pr_cont("%s", err_segs[i].rt_preempted ? " preempted" : ""); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_BH) + pr_cont(" BH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_IRQ) + pr_cont(" IRQ"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_PREEMPT) + pr_cont(" PREEMPT"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RBH) + pr_cont(" RBH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_SCHED) + pr_cont(" SCHED"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_1) + pr_cont(" RCU_1"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_2) + pr_cont(" RCU_2"); + pr_cont("\n"); } + if (rt_read_preempted) + pr_alert("\tReader was preempted.\n"); } if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); @@ -3390,10 +4176,13 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); - rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay); + if (cur_ops->gp_slow_unregister) + cur_ops->gp_slow_unregister(NULL); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) + rcu_gpwrap_lag_cleanup(); } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD static void rcu_torture_leak_cb(struct rcu_head *rhp) { } @@ -3411,7 +4200,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ /* * Verify that double-free causes debug-objects to complain, but only @@ -3420,39 +4208,43 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ static void rcu_test_debug_objects(void) { -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD struct rcu_head rh1; struct rcu_head rh2; + int idx; + + if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) { + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n", + KBUILD_MODNAME, cur_ops->name); + return; + } + + if (WARN_ON_ONCE(cur_ops->debug_objects && + (!cur_ops->call || !cur_ops->cb_barrier))) + return; + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); + pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name); /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. */ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu_hurry(&rh2, rcu_torture_leak_cb); - call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */ + cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + cur_ops->call(&rh2, rcu_torture_leak_cb); + cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ if (rhp) { - call_rcu_hurry(rhp, rcu_torture_leak_cb); - call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ + cur_ops->call(rhp, rcu_torture_leak_cb); + cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ } - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); + cur_ops->readunlock(idx); /* Wait for them all to get done so we can safely return. */ - rcu_barrier(); - pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); + cur_ops->cb_barrier(); + pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); kfree(rhp); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } static void rcutorture_sync(void) @@ -3463,6 +4255,188 @@ static void rcutorture_sync(void) cur_ops->sync(); } +static DEFINE_MUTEX(mut0); +static DEFINE_MUTEX(mut1); +static DEFINE_MUTEX(mut2); +static DEFINE_MUTEX(mut3); +static DEFINE_MUTEX(mut4); +static DEFINE_MUTEX(mut5); +static DEFINE_MUTEX(mut6); +static DEFINE_MUTEX(mut7); +static DEFINE_MUTEX(mut8); +static DEFINE_MUTEX(mut9); + +static DECLARE_RWSEM(rwsem0); +static DECLARE_RWSEM(rwsem1); +static DECLARE_RWSEM(rwsem2); +static DECLARE_RWSEM(rwsem3); +static DECLARE_RWSEM(rwsem4); +static DECLARE_RWSEM(rwsem5); +static DECLARE_RWSEM(rwsem6); +static DECLARE_RWSEM(rwsem7); +static DECLARE_RWSEM(rwsem8); +static DECLARE_RWSEM(rwsem9); + +DEFINE_STATIC_SRCU(srcu0); +DEFINE_STATIC_SRCU(srcu1); +DEFINE_STATIC_SRCU(srcu2); +DEFINE_STATIC_SRCU(srcu3); +DEFINE_STATIC_SRCU(srcu4); +DEFINE_STATIC_SRCU(srcu5); +DEFINE_STATIC_SRCU(srcu6); +DEFINE_STATIC_SRCU(srcu7); +DEFINE_STATIC_SRCU(srcu8); +DEFINE_STATIC_SRCU(srcu9); + +static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i, + int cyclelen, int deadlock) +{ + int j = i + 1; + + if (j >= cyclelen) + j = deadlock ? 0 : -1; + if (j >= 0) + pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i); + else + pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i); + return j; +} + +// Test lockdep on SRCU-based deadlock scenarios. +static void rcu_torture_init_srcu_lockdep(void) +{ + int cyclelen; + int deadlock; + bool err = false; + int i; + int j; + int idx; + struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4, + &mut5, &mut6, &mut7, &mut8, &mut9 }; + struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4, + &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 }; + struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4, + &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 }; + int testtype; + + if (!test_srcu_lockdep) + return; + + deadlock = test_srcu_lockdep / 1000; + testtype = (test_srcu_lockdep / 10) % 100; + cyclelen = test_srcu_lockdep % 10; + WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus)); + if (WARN_ONCE(deadlock != !!deadlock, + "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n", + __func__, test_srcu_lockdep, deadlock)) + err = true; + if (WARN_ONCE(cyclelen <= 0, + "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n", + __func__, test_srcu_lockdep, cyclelen)) + err = true; + if (err) + goto err_out; + + if (testtype == 0) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu", + "srcu_read_unlock", i, cyclelen, deadlock); + idx = srcu_read_lock(srcus[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + srcu_read_unlock(srcus[i], idx); + } + return; + } + + if (testtype == 1) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + mutex_lock(muts[i]); + mutex_unlock(muts[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu", + "mutex_unlock", i, cyclelen, deadlock); + mutex_lock(muts[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + mutex_unlock(muts[i]); + } + return; + } + + if (testtype == 2) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + down_read(rwsems[i]); + up_read(rwsems[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu", + "up_write", i, cyclelen, deadlock); + down_write(rwsems[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + up_write(rwsems[i]); + } + return; + } + +#ifdef CONFIG_TASKS_TRACE_RCU + if (testtype == 3) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock"; + char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace" + : "synchronize_srcu"; + char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock"; + + j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock); + if (i == 0) + rcu_read_lock_trace(); + else + idx = srcu_read_lock(srcus[i]); + if (j >= 0) { + if (i == cyclelen - 1) + synchronize_rcu_tasks_trace(); + else + synchronize_srcu(srcus[j]); + } + if (i == 0) + rcu_read_unlock_trace(); + else + srcu_read_unlock(srcus[i], idx); + } + return; + } +#endif // #ifdef CONFIG_TASKS_TRACE_RCU + +err_out: + pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep); + pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__); + pr_info("%s: D: Deadlock if nonzero.\n", __func__); + pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__); + pr_info("%s: L: Cycle length.\n", __func__); + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU)) + pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__); +} + static int __init rcu_torture_init(void) { @@ -3501,9 +4475,25 @@ rcu_torture_init(void) pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } + if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops || + !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n", + cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU)); + nocbs_nthreads = 0; + } if (cur_ops->init) cur_ops->init(); + rcu_torture_init_srcu_lockdep(); + + if (nfakewriters >= 0) { + nrealfakewriters = nfakewriters; + } else { + nrealfakewriters = num_online_cpus() - 2 - nfakewriters; + if (nrealfakewriters <= 0) + nrealfakewriters = 1; + } + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -3512,8 +4502,8 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); start_gp_seq = gp_seq; pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", cur_ops->name, (long)gp_seq, flags); @@ -3540,7 +4530,6 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; n_rcu_torture_boost_failure = 0; n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) @@ -3557,12 +4546,8 @@ rcu_torture_init(void) /* Start up the kthreads. */ rcu_torture_write_types(); - firsterr = torture_create_kthread(rcu_torture_writer, NULL, - writer_task); - if (torture_init_error(firsterr)) - goto unwind; - if (nfakewriters > 0) { - fakewriter_tasks = kcalloc(nfakewriters, + if (nrealfakewriters > 0) { + fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -3571,7 +4556,7 @@ rcu_torture_init(void) goto unwind; } } - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); if (torture_init_error(firsterr)) @@ -3593,6 +4578,15 @@ rcu_torture_init(void) if (torture_init_error(firsterr)) goto unwind; } + + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (torture_init_error(firsterr)) + goto unwind; + + firsterr = rcu_torture_updown_init(); + if (torture_init_error(firsterr)) + goto unwind; nrealnocbers = nocbs_nthreads; if (WARN_ON(nrealnocbers < 0)) nrealnocbers = 1; @@ -3636,7 +4630,9 @@ rcu_torture_init(void) } if (fqs_duration < 0) fqs_duration = 0; - if (fqs_duration) { + if (fqs_holdoff < 0) + fqs_holdoff = 0; + if (fqs_duration && fqs_holdoff) { /* Create the fqs thread */ firsterr = torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); @@ -3678,10 +4674,24 @@ rcu_torture_init(void) firsterr = rcu_torture_read_exit_init(); if (torture_init_error(firsterr)) goto unwind; + if (preempt_duration > 0) { + firsterr = torture_create_kthread(rcu_torture_preempt, NULL, preempt_task); + if (torture_init_error(firsterr)) + goto unwind; + } if (object_debug) rcu_test_debug_objects(); + + if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister)) + cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) { + firsterr = rcu_gpwrap_lag_init(); + if (torture_init_error(firsterr)) + goto unwind; + } + torture_init_end(); - rcu_gp_slow_register(&rcu_fwd_cb_nodelay); return 0; unwind: diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 435c884c02b5..07a313782dfd 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -28,6 +28,7 @@ #include <linux/rcupdate_trace.h> #include <linux/reboot.h> #include <linux/sched.h> +#include <linux/seq_buf.h> #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/stat.h> @@ -35,6 +36,7 @@ #include <linux/slab.h> #include <linux/torture.h> #include <linux/types.h> +#include <linux/sched/clock.h> #include "rcu.h" @@ -63,6 +65,7 @@ do { \ #define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x) +MODULE_DESCRIPTION("Scalability test for object reference mechanisms"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>"); @@ -73,11 +76,16 @@ MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); +// Number of seconds to extend warm-up and cool-down for multiple guest OSes +torture_param(long, guest_os_delay, 0, + "Number of seconds to extend warm-up/cool-down for multiple guest OSes."); // Wait until there are multiple CPUs before starting test. torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, "Holdoff time before test start (s)"); +// Number of typesafe_lookup structures, that is, the degree of concurrency. +torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures."); // Number of loops per experiment, all readers execute operations concurrently. -torture_param(long, loops, 10000, "Number of loops per experiment."); +torture_param(int, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. @@ -124,14 +132,15 @@ static int exp_idx; // Operations vector for selecting different types of tests. struct ref_scale_ops { - void (*init)(void); + bool (*init)(void); void (*cleanup)(void); void (*readsection)(const int nloops); void (*delaysection)(const int nloops, const int udl, const int ndl); + bool enable_irqs; const char *name; }; -static struct ref_scale_ops *cur_ops; +static const struct ref_scale_ops *cur_ops; static void un_delay(const int udl, const int ndl) { @@ -162,11 +171,12 @@ static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl } } -static void rcu_sync_scale_init(void) +static bool rcu_sync_scale_init(void) { + return true; } -static struct ref_scale_ops rcu_ops = { +static const struct ref_scale_ops rcu_ops = { .init = rcu_sync_scale_init, .readsection = ref_rcu_read_section, .delaysection = ref_rcu_delay_section, @@ -175,6 +185,8 @@ static struct ref_scale_ops rcu_ops = { // Definitions for SRCU ref scale testing. DEFINE_STATIC_SRCU(srcu_refctl_scale); +DEFINE_STATIC_SRCU_FAST(srcu_fast_refctl_scale); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_fast_updown_refctl_scale); static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; static void srcu_ref_scale_read_section(const int nloops) @@ -200,13 +212,85 @@ static void srcu_ref_scale_delay_section(const int nloops, const int udl, const } } -static struct ref_scale_ops srcu_ops = { +static const struct ref_scale_ops srcu_ops = { .init = rcu_sync_scale_init, .readsection = srcu_ref_scale_read_section, .delaysection = srcu_ref_scale_delay_section, .name = "srcu" }; +static bool srcu_fast_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_refctl_scale; + return true; +} + +static void srcu_fast_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_ops = { + .init = srcu_fast_sync_scale_init, + .readsection = srcu_fast_ref_scale_read_section, + .delaysection = srcu_fast_ref_scale_delay_section, + .name = "srcu-fast" +}; + +static bool srcu_fast_updown_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_updown_refctl_scale; + return true; +} + +static void srcu_fast_updown_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); + } +} + +static void srcu_fast_updown_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_updown_ops = { + .init = srcu_fast_updown_sync_scale_init, + .readsection = srcu_fast_updown_ref_scale_read_section, + .delaysection = srcu_fast_updown_ref_scale_delay_section, + .name = "srcu-fast-updown" +}; + #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -227,7 +311,7 @@ static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, c un_delay(udl, ndl); } -static struct ref_scale_ops rcu_tasks_ops = { +static const struct ref_scale_ops rcu_tasks_ops = { .init = rcu_sync_scale_init, .readsection = rcu_tasks_ref_scale_read_section, .delaysection = rcu_tasks_ref_scale_delay_section, @@ -266,7 +350,7 @@ static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, c } } -static struct ref_scale_ops rcu_trace_ops = { +static const struct ref_scale_ops rcu_trace_ops = { .init = rcu_sync_scale_init, .readsection = rcu_trace_ref_scale_read_section, .delaysection = rcu_trace_ref_scale_delay_section, @@ -284,6 +368,9 @@ static struct ref_scale_ops rcu_trace_ops = { // Definitions for reference count static atomic_t refcnt; +// Definitions acquire-release. +static DEFINE_PER_CPU(unsigned long, test_acqrel); + static void ref_refcnt_section(const int nloops) { int i; @@ -305,19 +392,198 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops refcnt_ops = { +static const struct ref_scale_ops refcnt_ops = { .init = rcu_sync_scale_init, .readsection = ref_refcnt_section, .delaysection = ref_refcnt_delay_section, .name = "refcnt" }; +static void ref_percpuinc_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + this_cpu_dec(test_acqrel); + } +} + +static void ref_percpuinc_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + un_delay(udl, ndl); + this_cpu_dec(test_acqrel); + } +} + +static const struct ref_scale_ops percpuinc_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_percpuinc_section, + .delaysection = ref_percpuinc_delay_section, + .name = "percpuinc" +}; + +// Note that this can lose counts in preemptible kernels. +static void ref_incpercpu_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static void ref_incpercpu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static const struct ref_scale_ops incpercpu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpu_section, + .delaysection = ref_incpercpu_delay_section, + .name = "incpercpu" +}; + +static void ref_incpercpupreempt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static void ref_incpercpupreempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static const struct ref_scale_ops incpercpupreempt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpupreempt_section, + .delaysection = ref_incpercpupreempt_delay_section, + .name = "incpercpupreempt" +}; + +static void ref_incpercpubh_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static void ref_incpercpubh_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static const struct ref_scale_ops incpercpubh_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpubh_section, + .delaysection = ref_incpercpubh_delay_section, + .enable_irqs = true, + .name = "incpercpubh" +}; + +static void ref_incpercpuirqsave_section(const int nloops) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static void ref_incpercpuirqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static const struct ref_scale_ops incpercpuirqsave_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpuirqsave_section, + .delaysection = ref_incpercpuirqsave_delay_section, + .name = "incpercpuirqsave" +}; + // Definitions for rwlock static rwlock_t test_rwlock; -static void ref_rwlock_init(void) +static bool ref_rwlock_init(void) { rwlock_init(&test_rwlock); + return true; } static void ref_rwlock_section(const int nloops) @@ -341,7 +607,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops rwlock_ops = { +static const struct ref_scale_ops rwlock_ops = { .init = ref_rwlock_init, .readsection = ref_rwlock_section, .delaysection = ref_rwlock_delay_section, @@ -351,9 +617,10 @@ static struct ref_scale_ops rwlock_ops = { // Definitions for rwsem static struct rw_semaphore test_rwsem; -static void ref_rwsem_init(void) +static bool ref_rwsem_init(void) { init_rwsem(&test_rwsem); + return true; } static void ref_rwsem_section(const int nloops) @@ -377,7 +644,7 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n } } -static struct ref_scale_ops rwsem_ops = { +static const struct ref_scale_ops rwsem_ops = { .init = ref_rwsem_init, .readsection = ref_rwsem_section, .delaysection = ref_rwsem_delay_section, @@ -412,7 +679,7 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd preempt_enable(); } -static struct ref_scale_ops lock_ops = { +static const struct ref_scale_ops lock_ops = { .readsection = ref_lock_section, .delaysection = ref_lock_delay_section, .name = "lock" @@ -447,15 +714,12 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in preempt_enable(); } -static struct ref_scale_ops lock_irq_ops = { +static const struct ref_scale_ops lock_irq_ops = { .readsection = ref_lock_irq_section, .delaysection = ref_lock_irq_delay_section, .name = "lock-irq" }; -// Definitions acquire-release. -static DEFINE_PER_CPU(unsigned long, test_acqrel); - static void ref_acqrel_section(const int nloops) { unsigned long x; @@ -483,7 +747,7 @@ static void ref_acqrel_delay_section(const int nloops, const int udl, const int preempt_enable(); } -static struct ref_scale_ops acqrel_ops = { +static const struct ref_scale_ops acqrel_ops = { .readsection = ref_acqrel_section, .delaysection = ref_acqrel_delay_section, .name = "acqrel" @@ -491,6 +755,39 @@ static struct ref_scale_ops acqrel_ops = { static volatile u64 stopopts; +static void ref_sched_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += sched_clock(); + preempt_enable(); + stopopts = x; +} + +static void ref_sched_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += sched_clock(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static const struct ref_scale_ops sched_clock_ops = { + .readsection = ref_sched_clock_section, + .delaysection = ref_sched_clock_delay_section, + .name = "sched-clock" +}; + + static void ref_clock_section(const int nloops) { u64 x = 0; @@ -517,12 +814,402 @@ static void ref_clock_delay_section(const int nloops, const int udl, const int n stopopts = x; } -static struct ref_scale_ops clock_ops = { +static const struct ref_scale_ops clock_ops = { .readsection = ref_clock_section, .delaysection = ref_clock_delay_section, .name = "clock" }; +static void ref_jiffies_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += jiffies; + preempt_enable(); + stopopts = x; +} + +static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += jiffies; + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static const struct ref_scale_ops jiffies_ops = { + .readsection = ref_jiffies_section, + .delaysection = ref_jiffies_delay_section, + .name = "jiffies" +}; + +static void ref_preempt_section(const int nloops) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + preempt_enable(); + } + migrate_enable(); +} + +static void ref_preempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + un_delay(udl, ndl); + preempt_enable(); + } + migrate_enable(); +} + +static const struct ref_scale_ops preempt_ops = { + .readsection = ref_preempt_section, + .delaysection = ref_preempt_delay_section, + .name = "preempt" +}; + +static void ref_bh_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + local_bh_enable(); + } + preempt_enable(); +} + +static void ref_bh_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + un_delay(udl, ndl); + local_bh_enable(); + } + preempt_enable(); +} + +static const struct ref_scale_ops bh_ops = { + .readsection = ref_bh_section, + .delaysection = ref_bh_delay_section, + .enable_irqs = true, + .name = "bh" +}; + +static void ref_irq_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + local_irq_enable(); + } + preempt_enable(); +} + +static void ref_irq_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + un_delay(udl, ndl); + local_irq_enable(); + } + preempt_enable(); +} + +static const struct ref_scale_ops irq_ops = { + .readsection = ref_irq_section, + .delaysection = ref_irq_delay_section, + .name = "irq" +}; + +static void ref_irqsave_section(const int nloops) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + local_irq_restore(flags); + } + preempt_enable(); +} + +static void ref_irqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + un_delay(udl, ndl); + local_irq_restore(flags); + } + preempt_enable(); +} + +static const struct ref_scale_ops irqsave_ops = { + .readsection = ref_irqsave_section, + .delaysection = ref_irqsave_delay_section, + .name = "irqsave" +}; + +//////////////////////////////////////////////////////////////////////// +// +// Methods leveraging SLAB_TYPESAFE_BY_RCU. +// + +// Item to look up in a typesafe manner. Array of pointers to these. +struct refscale_typesafe { + atomic_t rts_refctr; // Used by all flavors + spinlock_t rts_lock; + seqlock_t rts_seqlock; + unsigned int a; + unsigned int b; +}; + +static struct kmem_cache *typesafe_kmem_cachep; +static struct refscale_typesafe **rtsarray; +static long rtsarray_size; +static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand); +static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start); +static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start); + +// Conditionally acquire an explicit in-structure reference count. +static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ + return atomic_inc_not_zero(&rtsp->rts_refctr); +} + +// Unconditionally release an explicit in-structure reference count. +static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start) +{ + if (!atomic_dec_return(&rtsp->rts_refctr)) { + WRITE_ONCE(rtsp->a, rtsp->a + 1); + kmem_cache_free(typesafe_kmem_cachep, rtsp); + } + return true; +} + +// Unconditionally acquire an explicit in-structure spinlock. +static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ + spin_lock(&rtsp->rts_lock); + return true; +} + +// Unconditionally release an explicit in-structure spinlock. +static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start) +{ + spin_unlock(&rtsp->rts_lock); + return true; +} + +// Unconditionally acquire an explicit in-structure sequence lock. +static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ + *start = read_seqbegin(&rtsp->rts_seqlock); + return true; +} + +// Conditionally release an explicit in-structure sequence lock. Return +// true if this release was successful, that is, if no retry is required. +static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start) +{ + return !read_seqretry(&rtsp->rts_seqlock, start); +} + +// Do a read-side critical section with the specified delay in +// microseconds and nanoseconds inserted so as to increase probability +// of failure. +static void typesafe_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned int a; + unsigned int b; + int i; + long idx; + struct refscale_typesafe *rtsp; + unsigned int start; + + for (i = nloops; i >= 0; i--) { + preempt_disable(); + idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size; + preempt_enable(); +retry: + rcu_read_lock(); + rtsp = rcu_dereference(rtsarray[idx]); + a = READ_ONCE(rtsp->a); + if (!rts_acquire(rtsp, &start)) { + rcu_read_unlock(); + goto retry; + } + if (a != READ_ONCE(rtsp->a)) { + (void)rts_release(rtsp, start); + rcu_read_unlock(); + goto retry; + } + un_delay(udl, ndl); + b = READ_ONCE(rtsp->a); + // Remember, seqlock read-side release can fail. + if (!rts_release(rtsp, start)) { + rcu_read_unlock(); + goto retry; + } + WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b); + b = rtsp->b; + rcu_read_unlock(); + WARN_ON_ONCE(a * a != b); + } +} + +// Because the acquisition and release methods are expensive, there +// is no point in optimizing away the un_delay() function's two checks. +// Thus simply define typesafe_read_section() as a simple wrapper around +// typesafe_delay_section(). +static void typesafe_read_section(const int nloops) +{ + typesafe_delay_section(nloops, 0, 0); +} + +// Allocate and initialize one refscale_typesafe structure. +static struct refscale_typesafe *typesafe_alloc_one(void) +{ + struct refscale_typesafe *rtsp; + + rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL); + if (!rtsp) + return NULL; + atomic_set(&rtsp->rts_refctr, 1); + WRITE_ONCE(rtsp->a, rtsp->a + 1); + WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a); + return rtsp; +} + +// Slab-allocator constructor for refscale_typesafe structures created +// out of a new slab of system memory. +static void refscale_typesafe_ctor(void *rtsp_in) +{ + struct refscale_typesafe *rtsp = rtsp_in; + + spin_lock_init(&rtsp->rts_lock); + seqlock_init(&rtsp->rts_seqlock); + preempt_disable(); + rtsp->a = torture_random(this_cpu_ptr(&refscale_rand)); + preempt_enable(); +} + +static const struct ref_scale_ops typesafe_ref_ops; +static const struct ref_scale_ops typesafe_lock_ops; +static const struct ref_scale_ops typesafe_seqlock_ops; + +// Initialize for a typesafe test. +static bool typesafe_init(void) +{ + long idx; + long si = lookup_instances; + + typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe", + sizeof(struct refscale_typesafe), sizeof(void *), + SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor); + if (!typesafe_kmem_cachep) + return false; + if (si < 0) + si = -si * nr_cpu_ids; + else if (si == 0) + si = nr_cpu_ids; + rtsarray_size = si; + rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL); + if (!rtsarray) + return false; + for (idx = 0; idx < rtsarray_size; idx++) { + rtsarray[idx] = typesafe_alloc_one(); + if (!rtsarray[idx]) + return false; + } + if (cur_ops == &typesafe_ref_ops) { + rts_acquire = typesafe_ref_acquire; + rts_release = typesafe_ref_release; + } else if (cur_ops == &typesafe_lock_ops) { + rts_acquire = typesafe_lock_acquire; + rts_release = typesafe_lock_release; + } else if (cur_ops == &typesafe_seqlock_ops) { + rts_acquire = typesafe_seqlock_acquire; + rts_release = typesafe_seqlock_release; + } else { + WARN_ON_ONCE(1); + return false; + } + return true; +} + +// Clean up after a typesafe test. +static void typesafe_cleanup(void) +{ + long idx; + + if (rtsarray) { + for (idx = 0; idx < rtsarray_size; idx++) + kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]); + kfree(rtsarray); + rtsarray = NULL; + rtsarray_size = 0; + } + kmem_cache_destroy(typesafe_kmem_cachep); + typesafe_kmem_cachep = NULL; + rts_acquire = NULL; + rts_release = NULL; +} + +// The typesafe_init() function distinguishes these structures by address. +static const struct ref_scale_ops typesafe_ref_ops = { + .init = typesafe_init, + .cleanup = typesafe_cleanup, + .readsection = typesafe_read_section, + .delaysection = typesafe_delay_section, + .name = "typesafe_ref" +}; + +static const struct ref_scale_ops typesafe_lock_ops = { + .init = typesafe_init, + .cleanup = typesafe_cleanup, + .readsection = typesafe_read_section, + .delaysection = typesafe_delay_section, + .name = "typesafe_lock" +}; + +static const struct ref_scale_ops typesafe_seqlock_ops = { + .init = typesafe_init, + .cleanup = typesafe_cleanup, + .readsection = typesafe_read_section, + .delaysection = typesafe_delay_section, + .name = "typesafe_seqlock" +}; + static void rcu_scale_one_reader(void) { if (readdelay <= 0) @@ -531,6 +1218,18 @@ static void rcu_scale_one_reader(void) cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); } +// Warm up cache, or, if needed run a series of rcu_scale_one_reader() +// to allow multiple rcuscale guest OSes to collect mutually valid data. +static void rcu_scale_warm_cool(void) +{ + unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1); + + do { + rcu_scale_one_reader(); + cond_resched(); + } while (time_before(jiffies, jdone)); +} + // Reader kthread. Repeatedly does empty RCU read-side // critical section, minimizing update-side interference. static int @@ -559,7 +1258,7 @@ repeat: goto end; // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(raw_smp_processor_id() != me); + WARN_ON_ONCE(raw_smp_processor_id() != me % nr_cpu_ids); WRITE_ONCE(rt->start_reader, 0); if (!atomic_dec_return(&n_started)) @@ -575,15 +1274,18 @@ repeat: if (!atomic_dec_return(&n_warmedup)) while (atomic_read_acquire(&n_warmedup)) rcu_scale_one_reader(); - // Also keep interrupts disabled. This also has the effect - // of preventing entries into slow path for rcu_read_unlock(). - local_irq_save(flags); + // Also keep interrupts disabled when it is safe to do so, which + // it is not for local_bh_enable(). This also has the effect of + // preventing entries into slow path for rcu_read_unlock(). + if (!cur_ops->enable_irqs) + local_irq_save(flags); start = ktime_get_mono_fast_ns(); rcu_scale_one_reader(); duration = ktime_get_mono_fast_ns() - start; - local_irq_restore(flags); + if (!cur_ops->enable_irqs) + local_irq_restore(flags); rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; // To reduce runtime-skew noise, do maintain-load invocations until @@ -622,32 +1324,34 @@ static u64 process_durations(int n) { int i; struct reader_task *rt; - char buf1[64]; + struct seq_buf s; char *buf; u64 sum = 0; buf = kmalloc(800 + 64, GFP_KERNEL); if (!buf) return 0; - buf[0] = 0; - sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)", - exp_idx); + seq_buf_init(&s, buf, 800 + 64); + + seq_buf_printf(&s, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)", + exp_idx); for (i = 0; i < n && !torture_must_stop(); i++) { rt = &(reader_tasks[i]); - sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); if (i % 5 == 0) - strcat(buf, "\n"); - if (strlen(buf) >= 800) { - pr_alert("%s", buf); - buf[0] = 0; + seq_buf_putc(&s, '\n'); + + if (seq_buf_used(&s) >= 800) { + pr_alert("%s", seq_buf_str(&s)); + seq_buf_clear(&s); } - strcat(buf, buf1); + + seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns); sum += rt->last_duration_ns; } - pr_alert("%s\n", buf); + pr_alert("%s\n", seq_buf_str(&s)); kfree(buf); return sum; @@ -670,7 +1374,7 @@ static int main_func(void *arg) set_user_nice(current, MAX_NICE); VERBOSE_SCALEOUT("main_func task started"); - result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); + result_avg = kcalloc(nruns, sizeof(*result_avg), GFP_KERNEL); buf = kzalloc(800 + 64, GFP_KERNEL); if (!result_avg || !buf) { SCALEOUT_ERRSTRING("out of memory"); @@ -685,6 +1389,7 @@ static int main_func(void *arg) schedule_timeout_uninterruptible(1); // Start exp readers up per experiment + rcu_scale_warm_cool(); for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { if (torture_must_stop()) goto end; @@ -715,6 +1420,7 @@ static int main_func(void *arg) result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); } + rcu_scale_warm_cool(); // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); @@ -754,11 +1460,11 @@ end: } static void -ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } static void @@ -780,9 +1486,9 @@ ref_scale_cleanup(void) reader_tasks[i].task); } kfree(reader_tasks); + reader_tasks = NULL; torture_stop_kthread("main_task", main_task); - kfree(main_task); // Do scale-type-specific cleanup operations. if (cur_ops->cleanup != NULL) @@ -795,7 +1501,7 @@ ref_scale_cleanup(void) static int ref_scale_shutdown(void *arg) { - wait_event(shutdown_wq, shutdown_start); + wait_event_idle(shutdown_wq, shutdown_start); smp_mb(); // Wake before output. ref_scale_cleanup(); @@ -809,9 +1515,15 @@ ref_scale_init(void) { long i; int firsterr = 0; - static struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, + static const struct ref_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_fast_updown_ops, + RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &percpuinc_ops, &incpercpu_ops, &incpercpupreempt_ops, + &incpercpubh_ops, &incpercpuirqsave_ops, + &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &sched_clock_ops, &clock_ops, &jiffies_ops, + &preempt_ops, &bh_ops, &irq_ops, &irqsave_ops, + &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) @@ -833,7 +1545,10 @@ ref_scale_init(void) goto unwind; } if (cur_ops->init) - cur_ops->init(); + if (!cur_ops->init()) { + firsterr = -EUCLEAN; + goto unwind; + } ref_scale_print_module_parms(cur_ops, "Start of test"); @@ -850,12 +1565,16 @@ ref_scale_init(void) // Reader tasks (default to ~75% of online CPUs). if (nreaders < 0) nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); - if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops)) + if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops)) loops = 1; if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders)) nreaders = 1; if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns)) nruns = 1; + if (WARN_ONCE(loops > INT_MAX / nreaders, + "%s: nreaders * loops will overflow, adjusted loops to %d", + __func__, INT_MAX / nreaders)) + loops = INT_MAX / nreaders; reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { @@ -867,12 +1586,11 @@ ref_scale_init(void) VERBOSE_SCALEOUT("Starting %d reader threads", nreaders); for (i = 0; i < nreaders; i++) { + init_waitqueue_head(&reader_tasks[i].wq); firsterr = torture_create_kthread(ref_scale_reader, (void *)i, reader_tasks[i].task); if (torture_init_error(firsterr)) goto unwind; - - init_waitqueue_head(&(reader_tasks[i].wq)); } // Main Task diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index b12fb0cec44d..3450c3751ef7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -20,7 +20,11 @@ #include "rcu_segcblist.h" #include "rcu.h" +#ifndef CONFIG_TREE_RCU int rcu_scheduler_active __read_mostly; +#else // #ifndef CONFIG_TREE_RCU +extern int rcu_scheduler_active; +#endif // #else // #ifndef CONFIG_TREE_RCU static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; @@ -96,18 +100,21 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); */ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; + int newval; + preempt_disable(); // Needed for PREEMPT_LAZY + newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); - if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task()) + preempt_enable(); + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task() && !irqs_disabled()) swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPTION operation - * means that we get away with murder on synchronization. ;-) + * that become ready as a result. Single-CPU operation and preemption + * disabling mean that we get away with murder on synchronization. ;-) */ void srcu_drive_gp(struct work_struct *wp) { @@ -117,8 +124,11 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + preempt_disable(); // Needed for PREEMPT_LAZY + if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { + preempt_enable(); return; /* Already running or nothing to do. */ + } /* Remove recently arrived callbacks and wait for readers. */ WRITE_ONCE(ssp->srcu_gp_running, true); @@ -130,14 +140,23 @@ void srcu_drive_gp(struct work_struct *wp) idx = (ssp->srcu_idx & 0x2) / 2; WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ - swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + preempt_enable(); + do { + // Deadlock issues prevent __srcu_read_unlock() from + // doing an unconditional wakeup, so polling is required. + swait_event_timeout_exclusive(ssp->srcu_wq, + !READ_ONCE(ssp->srcu_lock_nesting[idx]), HZ / 10); + } while (READ_ONCE(ssp->srcu_lock_nesting[idx])); + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + preempt_enable(); /* Invoke the callbacks we removed above. */ while (lh) { rhp = lh; lh = lh->next; + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -149,8 +168,11 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_running, false); - if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) + idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); + preempt_enable(); + if (idx) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); @@ -159,9 +181,11 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; + lockdep_assert_preemption_disabled(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); - if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) + if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { return; + } WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) @@ -182,11 +206,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; + preempt_disable(); // Needed for PREEMPT_LAZY local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); srcu_gp_start_if_needed(ssp); + preempt_enable(); } EXPORT_SYMBOL_GPL(call_srcu); @@ -197,6 +223,8 @@ void synchronize_srcu(struct srcu_struct *ssp) { struct rcu_synchronize rs; + srcu_lock_sync(&ssp->dep_map); + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || @@ -238,9 +266,12 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); */ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) { - unsigned long ret = get_state_synchronize_srcu(ssp); + unsigned long ret; + preempt_disable(); // Needed for PREEMPT_LAZY + ret = get_state_synchronize_srcu(ssp); srcu_gp_start_if_needed(ssp); + preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); @@ -253,15 +284,18 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) unsigned long cur_s = READ_ONCE(ssp->srcu_idx); barrier(); - return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); + return cookie == SRCU_GET_STATE_COMPLETED || + ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); +#ifndef CONFIG_TREE_RCU /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } +#endif // #ifndef CONFIG_TREE_RCU /* * Queue work for srcu_struct structures with early boot callbacks. diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ca4b5dcec675..ea3f128de06f 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -103,7 +103,7 @@ do { \ #define spin_trylock_irqsave_rcu_node(p, flags) \ ({ \ - bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ \ if (___locked) \ smp_mb__after_unlock_lock(); \ @@ -116,8 +116,9 @@ do { \ /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and - * srcu_read_unlock() running against them. So if the is_static parameter - * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + * srcu_read_unlock() running against them. So if the is_static + * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and + * ->srcu_ctrs[].srcu_unlocks. */ static void init_srcu_struct_data(struct srcu_struct *ssp) { @@ -128,15 +129,14 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; + sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head; sdp->mynode = NULL; sdp->cpu = cpu; INIT_WORK(&sdp->work, srcu_invoke_callbacks); @@ -154,7 +154,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) */ static inline bool srcu_invl_snp_seq(unsigned long s) { - return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ; + return s == SRCU_SNP_INIT_SEQ; } /* @@ -173,20 +173,20 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Initialize geometry if it has not already been initialized. */ rcu_init_geometry(); - ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags); - if (!ssp->node) + ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags); + if (!ssp->srcu_sup->node) return false; /* Work out the overall tree geometry. */ - ssp->level[0] = &ssp->node[0]; + ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0]; for (i = 1; i < rcu_num_lvls; i++) - ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1]; + ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); - WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; @@ -195,17 +195,17 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; - if (snp == &ssp->node[0]) { + if (snp == &ssp->srcu_sup->node[0]) { /* Root node, special case. */ snp->srcu_parent = NULL; continue; } /* Non-root node. */ - if (snp == ssp->level[level + 1]) + if (snp == ssp->srcu_sup->level[level + 1]) level++; - snp->srcu_parent = ssp->level[level - 1] + - (snp - ssp->level[level]) / + snp->srcu_parent = ssp->srcu_sup->level[level - 1] + + (snp - ssp->srcu_sup->level[level]) / levelspread[level - 1]; } @@ -214,7 +214,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) * leaves of the srcu_node tree. */ level = rcu_num_lvls - 1; - snp_first = ssp->level[level]; + snp_first = ssp->srcu_sup->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); sdp->mynode = &snp_first[cpu / levelspread[level]]; @@ -223,9 +223,9 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) snp->grplo = cpu; snp->grphi = cpu; } - sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); + sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo); } - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); return true; } @@ -236,69 +236,142 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) */ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { - ssp->srcu_size_state = SRCU_SIZE_SMALL; - ssp->node = NULL; - mutex_init(&ssp->srcu_cb_mutex); - mutex_init(&ssp->srcu_gp_mutex); - ssp->srcu_idx = 0; - ssp->srcu_gp_seq = 0; - ssp->srcu_barrier_seq = 0; - mutex_init(&ssp->srcu_barrier_mutex); - atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); - INIT_DELAYED_WORK(&ssp->work, process_srcu); - ssp->sda_is_static = is_static; if (!is_static) + ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL); + if (!ssp->srcu_sup) + return -ENOMEM; + if (!is_static) + spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; + ssp->srcu_sup->node = NULL; + mutex_init(&ssp->srcu_sup->srcu_cb_mutex); + mutex_init(&ssp->srcu_sup->srcu_gp_mutex); + ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; + ssp->srcu_sup->srcu_barrier_seq = 0; + mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); + ssp->srcu_sup->sda_is_static = is_static; + if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } if (!ssp->sda) - return -ENOMEM; + goto err_free_sup; init_srcu_struct_data(ssp); - ssp->srcu_gp_seq_needed_exp = 0; - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { - if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!ssp->sda_is_static) { - free_percpu(ssp->sda); - ssp->sda = NULL; - return -ENOMEM; - } - } else { - WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG); - } + ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL; + ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) + goto err_free_sda; + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } - smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ + ssp->srcu_sup->srcu_ssp = ssp; + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, + SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */ return 0; + +err_free_sda: + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } +err_free_sup: + if (!is_static) { + kfree(ssp->srcu_sup); + ssp->srcu_sup = NULL; + } + return -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *ssp, const char *name, - struct lock_class_key *key) +static int +__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); lockdep_init_map(&ssp->dep_map, name, key, 0); - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); return init_srcu_struct_fields(ssp, false); } + +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = 0; + return __init_srcu_struct_common(ssp, name, key); +} EXPORT_SYMBOL_GPL(__init_srcu_struct); +int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast); + +int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name, + struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast_updown); + #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** * init_srcu_struct - initialize a sleep-RCU structure * @ssp: structure to initialize. * - * Must invoke this on a given srcu_struct before passing that srcu_struct + * Use this in place of DEFINE_SRCU() and DEFINE_STATIC_SRCU() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock(), srcu_read_lock_nmisafe(), and friends. It is necessary + * to invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ int init_srcu_struct(struct srcu_struct *ssp) { - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); + ssp->srcu_reader_flavor = 0; return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); +/** + * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this in place of DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock_fast() and friends. It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. + */ +int init_srcu_struct_fast(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast); + +/** + * init_srcu_struct_fast_updown - initialize a fast-reader up/down sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this function in place of DEFINE_SRCU_FAST_UPDOWN() and + * DEFINE_STATIC_SRCU_FAST_UPDOWN() for non-static srcu_struct + * structures that are to be passed to srcu_read_lock_fast_updown(), + * srcu_down_read_fast(), and friends. It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. + */ +int init_srcu_struct_fast_updown(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast_updown); + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* @@ -306,8 +379,8 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); */ static void __srcu_transition_to_big(struct srcu_struct *ssp) { - lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); - smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC); + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC); } /* @@ -318,15 +391,15 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) unsigned long flags; /* Double-checked locking on ->srcu_size-state. */ - if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) return; - spin_lock_irqsave_rcu_node(ssp, flags); - if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) { - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } __srcu_transition_to_big(ssp); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -337,14 +410,14 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) { unsigned long j; - if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state) + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state) return; j = jiffies; - if (ssp->srcu_size_jiffies != j) { - ssp->srcu_size_jiffies = j; - ssp->srcu_n_lock_retries = 0; + if (ssp->srcu_sup->srcu_size_jiffies != j) { + ssp->srcu_sup->srcu_size_jiffies = j; + ssp->srcu_sup->srcu_n_lock_retries = 0; } - if (++ssp->srcu_n_lock_retries <= small_contention_lim) + if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim) return; __srcu_transition_to_big(ssp); } @@ -361,9 +434,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon if (spin_trylock_irqsave_rcu_node(sdp, *flags)) return; - spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_check_contention(ssp); - spin_unlock_irqrestore_rcu_node(ssp, *flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_rcu_node(sdp, *flags); } @@ -375,9 +448,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon */ static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) { - if (spin_trylock_irqsave_rcu_node(ssp, *flags)) + if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) return; - spin_lock_irqsave_rcu_node(ssp, *flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); spin_lock_irqsave_check_contention(ssp); } @@ -394,53 +467,72 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) unsigned long flags; /* The smp_load_acquire() pairs with the smp_store_release(). */ - if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(ssp, flags); - if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) { + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } init_srcu_struct_fields(ssp, true); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* - * Returns approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx. + * Is the current or any upcoming grace period to be expedited? */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) +static bool srcu_gp_is_expedited(struct srcu_struct *ssp) +{ + struct srcu_usage *sup = ssp->srcu_sup; + + return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)); +} + +/* + * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks + * values for the rank of per-CPU counters specified by idx, and returns + * true if the caller did the proper barrier (gp), and if the count of + * the locks matches that of the unlocks passed in. + */ +static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { int cpu; + unsigned long mask = 0; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } - return sum; + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp) + return false; + return sum == unlocks; } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks + * values for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { int cpu; - unsigned long mask = 0; + unsigned long mask = ssp->srcu_reader_flavor; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); - if (IS_ENABLED(CONFIG_PROVE_RCU)) - mask = mask | READ_ONCE(cpuc->srcu_nmi_safety); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks); + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } - WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)), - "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + *rdm = mask; return sum; } @@ -450,45 +542,89 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) */ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { + bool did_gp; + unsigned long rdm; unsigned long unlocks; - unlocks = srcu_readers_unlock_idx(ssp, idx); + unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); + did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP); /* * Make sure that a lock is always counted if the corresponding * unlock is counted. Needs to be a smp_mb() as the read side may * contain a read from a variable that is written to before the * synchronize_srcu() in the write side. In this case smp_mb()s - * A and B act like the store buffering pattern. + * A and B (or X and Y) act like the store buffering pattern. * - * This smp_mb() also pairs with smp_mb() C to prevent accesses - * after the synchronize_srcu() from being executed before the - * grace period ends. + * This smp_mb() also pairs with smp_mb() C (or, in the case of X, + * Z) to prevent accesses after the synchronize_srcu() from being + * executed before the grace period ends. */ - smp_mb(); /* A */ + if (!did_gp) + smp_mb(); /* A */ + else if (srcu_gp_is_expedited(ssp)) + synchronize_rcu_expedited(); /* X */ + else + synchronize_rcu(); /* X */ /* * If the locks are the same as the unlocks, then there must have - * been no readers on this index at some time in between. This does - * not mean that there are no more readers, as one could have read - * the current index but not have incremented the lock counter yet. + * been no readers on this index at some point in this function. + * But there might be more readers, as a task might have read + * the current ->srcu_ctrp but not yet have incremented its CPU's + * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible + * that most of the tasks have been preempted between fetching + * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And + * there could be almost (ULONG_MAX / sizeof(struct task_struct)) + * tasks in a system whose address space was fully populated + * with memory. Call this quantity Nt. * - * So suppose that the updater is preempted here for so long - * that more than ULONG_MAX non-nested readers come and go in - * the meantime. It turns out that this cannot result in overflow - * because if a reader modifies its unlock count after we read it - * above, then that reader's next load of ->srcu_idx is guaranteed - * to get the new value, which will cause it to operate on the - * other bank of counters, where it cannot contribute to the - * overflow of these counters. This means that there is a maximum - * of 2*NR_CPUS increments, which cannot overflow given current - * systems, especially not on 64-bit systems. + * So suppose that the updater is preempted at this + * point in the code for a long time. That now-preempted + * updater has already flipped ->srcu_ctrp (possibly during + * the preceding grace period), done an smp_mb() (again, + * possibly during the preceding grace period), and summed up + * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times + * can a given one of the aforementioned Nt tasks increment the + * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter, + * in the absence of nesting? * - * OK, how about nesting? This does impose a limit on nesting - * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, - * especially on 64-bit systems. + * It can clearly do so once, given that it has already fetched + * the old value of ->srcu_ctrp and is just about to use that + * value to index its increment of ->srcu_ctrs[idx].srcu_locks. + * But as soon as it leaves that SRCU read-side critical section, + * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must + * follow the updater's above read from that same value. Thus, + as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_ctrp, that task will be guaranteed to get the new index. + * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks + * in __srcu_read_unlock() is after the smp_mb(), and the fetch + * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb(). + * Thus, that task might not see the new value of ->srcu_ctrp until + * the -second- __srcu_read_lock(), which in turn means that this + * task might well increment ->srcu_ctrs[idx].srcu_locks for the + * old value of ->srcu_ctrp twice, not just once. + * + * However, it is important to note that a given smp_mb() takes + * effect not just for the task executing it, but also for any + * later task running on that same CPU. + * + * That is, there can be almost Nt + Nc further increments + * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc + * is the number of CPUs. But this is OK because the size of + * the task_struct structure limits the value of Nt and current + * systems limit Nc to a few thousand. + * + * OK, but what about nesting? This does impose a limit on + * nesting of half of the size of the task_struct structure + * (measured in bytes), which should be sufficient. A late 2022 + * TREE01 rcutorture run reported this size to be no less than + * 9408 bytes, allowing up to 4704 levels of nesting, which is + * comfortably beyond excessive. Especially on 64-bit systems, + * which are unlikely to be configured with an address space fully + * populated with memory, at least not anytime soon. */ - return srcu_readers_lock_idx(ssp, idx) == unlocks; + return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks); } /** @@ -506,12 +642,12 @@ static bool srcu_readers_active(struct srcu_struct *ssp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&cpuc->srcu_lock_count[0]); - sum += atomic_long_read(&cpuc->srcu_lock_count[1]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]); - sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks); + sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks); + sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks); + sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks); } return sum; } @@ -572,17 +708,20 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long gpstart; unsigned long j; unsigned long jbase = SRCU_INTERVAL; + struct srcu_usage *sup = ssp->srcu_sup; - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + if (srcu_gp_is_expedited(ssp)) jbase = 0; - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) { + if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { j = jiffies - 1; - gpstart = READ_ONCE(ssp->srcu_gp_start); + gpstart = READ_ONCE(sup->srcu_gp_start); if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { - WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) + ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay); + WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; } } @@ -599,74 +738,88 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + unsigned long delay; + struct srcu_usage *sup = ssp->srcu_sup; - if (WARN_ON(!srcu_get_delay(ssp))) + spin_lock_irq_rcu_node(ssp->srcu_sup); + delay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (WARN_ON(!delay)) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - flush_delayed_work(&ssp->work); + flush_delayed_work(&sup->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - del_timer_sync(&sdp->delay_work); + timer_delete_sync(&sdp->delay_work); flush_work(&sdp->work); if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) return; /* Forgot srcu_barrier(), so just leak it! */ } - if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || - WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || + if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), - rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); - return; /* Caller forgot to stop doing call_srcu()? */ + __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)), + rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed); + return; // Caller forgot to stop doing call_srcu()? + // Or caller invoked start_poll_synchronize_srcu() + // and then cleanup_srcu_struct() before that grace + // period ended? } - if (!ssp->sda_is_static) { + kfree(sup->node); + sup->node = NULL; + sup->srcu_size_state = SRCU_SIZE_SMALL; + if (!sup->sda_is_static) { free_percpu(ssp->sda); ssp->sda = NULL; + kfree(sup); + ssp->srcu_sup = NULL; } - kfree(ssp->node); - ssp->node = NULL; - ssp->srcu_size_state = SRCU_SIZE_SMALL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -#ifdef CONFIG_PROVE_RCU /* - * Check for consistent NMI safety. + * Check for consistent reader flavor. */ -void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe) +void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { - int nmi_safe_mask = 1 << nmi_safe; - int old_nmi_safe_mask; + int old_read_flavor; struct srcu_data *sdp; - /* NMI-unsafe use in NMI is a bad sign */ - WARN_ON_ONCE(!nmi_safe && in_nmi()); + /* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */ + WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi()); + WARN_ON_ONCE(read_flavor & (read_flavor - 1)); + sdp = raw_cpu_ptr(ssp->sda); - old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety); - if (!old_nmi_safe_mask) { - WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask); - return; + old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor); + WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor && + old_read_flavor != ssp->srcu_reader_flavor); + WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && !ssp->srcu_reader_flavor); + if (!old_read_flavor) { + old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor); + if (!old_read_flavor) + return; } - WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask); + WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor); } -EXPORT_SYMBOL_GPL(srcu_check_nmi_safety); -#endif /* CONFIG_PROVE_RCU */ +EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. - * Returns an index that must be passed to the matching srcu_read_unlock(). + * Returns a guaranteed non-negative index that must be passed to the + * matching __srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); + this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scp); } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -678,7 +831,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -691,13 +844,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { - int idx; - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp); + struct srcu_ctr *scp = raw_cpu_ptr(scpp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_lock_count[idx]); + atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scpp); } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); @@ -708,10 +860,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); */ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_unlock_count[idx]); + atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); @@ -722,26 +872,15 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp; int state; - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) - sdp = per_cpu_ptr(ssp->sda, 0); - else - sdp = this_cpu_ptr(ssp->sda); - lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); - spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); - spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ - WRITE_ONCE(ssp->srcu_gp_start, jiffies); - WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)); + WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ - rcu_seq_start(&ssp->srcu_gp_seq); - state = rcu_seq_state(ssp->srcu_gp_seq); + rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq); + state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } @@ -785,7 +924,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp int cpu; for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - if (!(mask & (1 << (cpu - snp->grplo)))) + if (!(mask & (1UL << (cpu - snp->grplo)))) continue; srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } @@ -806,7 +945,6 @@ static void srcu_gp_end(struct srcu_struct *ssp) bool cbs; bool last_lvl; int cpu; - unsigned long flags; unsigned long gpseq; int idx; unsigned long mask; @@ -814,36 +952,38 @@ static void srcu_gp_end(struct srcu_struct *ssp) unsigned long sgsne; struct srcu_node *snp; int ss_state; + struct srcu_usage *sup = ssp->srcu_sup; /* Prevent more than one additional grace period. */ - mutex_lock(&ssp->srcu_cb_mutex); + mutex_lock(&sup->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(ssp); - idx = rcu_seq_state(ssp->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + if (srcu_gp_is_expedited(ssp)) cbdelay = 0; - WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); - rcu_seq_end(&ssp->srcu_gp_seq); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); - spin_unlock_irq_rcu_node(ssp); - mutex_unlock(&ssp->srcu_gp_mutex); + WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); + rcu_seq_end(&sup->srcu_gp_seq); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq); + spin_unlock_irq_rcu_node(sup); + mutex_unlock(&sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - ss_state = smp_load_acquire(&ssp->srcu_size_state); + ss_state = smp_load_acquire(&sup->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_BARRIER) { - srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()), + cbdelay); } else { idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_irq_rcu_node(snp); cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; + last_lvl = snp >= sup->level[rcu_num_lvls - 1]; if (last_lvl) cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; @@ -866,27 +1006,27 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (!(gpseq & counter_wrap_check)) for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irq_rcu_node(sdp); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irq_rcu_node(sdp); } /* Callback initiation done, allow grace periods after next. */ - mutex_unlock(&ssp->srcu_cb_mutex); + mutex_unlock(&sup->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(ssp); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (!rcu_seq_state(gpseq) && - ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { + ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) { srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(sup); srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(sup); } /* Transition to big if needed. */ @@ -894,7 +1034,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (ss_state == SRCU_SIZE_ALLOC) init_srcu_struct_nodes(ssp, GFP_KERNEL); else - smp_store_release(&ssp->srcu_size_state, ss_state + 1); + smp_store_release(&sup->srcu_size_state, ss_state + 1); } } @@ -914,7 +1054,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp if (snp) for (; snp != NULL; snp = snp->srcu_parent) { sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); - if (rcu_seq_done(&ssp->srcu_gp_seq, s) || + if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) || (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; spin_lock_irqsave_rcu_node(snp, flags); @@ -927,9 +1067,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp spin_unlock_irqrestore_rcu_node(snp, flags); } spin_lock_irqsave_ssp_contention(ssp, &flags); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(ssp, flags); + if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -941,6 +1081,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp * * Note that this function also does the work of srcu_funnel_exp_start(), * in some cases by directly invoking it. + * + * The srcu read lock should be hold around this function. And s is a seq snap + * after holding that lock. */ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, unsigned long s, bool do_norm) @@ -951,9 +1094,10 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, struct srcu_node *snp; struct srcu_node *snp_leaf; unsigned long snp_seq; + struct srcu_usage *sup = ssp->srcu_sup; /* Ensure that snp node tree is fully initialized before traversing it */ - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) snp_leaf = NULL; else snp_leaf = sdp->mynode; @@ -961,7 +1105,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, if (snp_leaf) /* Each pass through the loop does one level of the srcu_node tree. */ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { - if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) + if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf) return; /* GP already done and CBs recorded. */ spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; @@ -988,20 +1132,19 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* Top of tree, must ensure the grace period will be started. */ spin_lock_irqsave_ssp_contention(ssp, &flags); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load * acquire setting up for initialization. */ - smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ + smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/ } - if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); + if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s); - /* If grace period not already done and none in progress, start it. */ - if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && - rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + /* If grace period not already in progress, start it. */ + if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && + rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) { srcu_gp_start(ssp); // And how can that list_add() in the "else" clause @@ -1010,24 +1153,26 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // can only be executed during early boot when there is only // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &ssp->work, + queue_delayed_work(rcu_gp_wq, &sup->work, !!srcu_get_delay(ssp)); - else if (list_empty(&ssp->work.work.entry)) - list_add(&ssp->work.work.entry, &srcu_boot_list); + else if (list_empty(&sup->work.work.entry)) + list_add(&sup->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(sup, flags); } /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->srcu_idx is not changed while checking. + * The caller must ensure that ->srcu_ctrp is not changed while checking. */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1039,36 +1184,72 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) } /* - * Increment the ->srcu_idx counter so that future SRCU readers will + * Increment the ->srcu_ctrp counter so that future SRCU readers will * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *ssp) { /* - * Ensure that if this updater saw a given reader's increment - * from __srcu_read_lock(), that reader was using an old value - * of ->srcu_idx. Also ensure that if a given reader sees the - * new value of ->srcu_idx, this updater's earlier scans cannot - * have seen that reader's increments (which is OK, because this - * grace period need not wait on that reader). + * Because the flip of ->srcu_ctrp is executed only if the + * preceding call to srcu_readers_active_idx_check() found that + * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums + * matched and because that summing uses atomic_long_read(), + * there is ordering due to a control dependency between that + * summing and the WRITE_ONCE() in this call to srcu_flip(). + * This ordering ensures that if this updater saw a given reader's + * increment from __srcu_read_lock(), that reader was using a value + * of ->srcu_ctrp from before the previous call to srcu_flip(), + * which should be quite rare. This ordering thus helps forward + * progress because the grace period could otherwise be delayed + * by additional calls to __srcu_read_lock() using that old (soon + * to be new) value of ->srcu_ctrp. + * + * This sum-equality check and ordering also ensures that if + * a given call to __srcu_read_lock() uses the new value of + * ->srcu_ctrp, this updater's earlier scans cannot have seen + * that reader's increments, which is all to the good, because + * this grace period need not wait on that reader. After all, + * if those earlier scans had seen that reader, there would have + * been a sum mismatch and this code would not be reached. + * + * This means that the following smp_mb() is redundant, but + * it stays until either (1) Compilers learn about this sort of + * control dependency or (2) Some production workload running on + * a production system is unduly delayed by this slowpath smp_mb(). + * Except for _lite() readers, where it is inoperative, which + * means that it is a good thing that it is redundant. */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + WRITE_ONCE(ssp->srcu_ctrp, + &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); /* * Ensure that if the updater misses an __srcu_read_unlock() - * increment, that task's next __srcu_read_lock() will see the - * above counter update. Note that both this memory barrier - * and the one in srcu_readers_active_idx_check() provide the - * guarantee for __srcu_read_lock(). + * increment, that task's __srcu_read_lock() following its next + * __srcu_read_lock() or __srcu_read_unlock() will see the above + * counter update. Note that both this memory barrier and the + * one in srcu_readers_active_idx_check() provide the guarantee + * for __srcu_read_lock(). + * + * Note that this is a performance optimization, in which we spend + * an otherwise unnecessary smp_mb() in order to reduce the number + * of full per-CPU-variable scans in srcu_readers_lock_idx() and + * srcu_readers_unlock_idx(). But this performance optimization + * is not so optimal for SRCU-fast, where we would be spending + * not smp_mb(), but rather synchronize_rcu(). At the same time, + * the overhead of the smp_mb() is in the noise, so there is no + * point in omitting it in the SRCU-fast case. So the same code + * is executed either way. */ smp_mb(); /* D */ /* Pairs with C. */ } /* - * If SRCU is likely idle, return true, otherwise return false. + * If SRCU is likely idle, in other words, the next SRCU grace period + * should be expedited, return true, otherwise return false. Except that + * in the presence of _lite() readers, always return false. * * Note that it is OK for several current from-idle requests for a new * grace period from idle to specify expediting because they will all end @@ -1088,7 +1269,7 @@ static void srcu_flip(struct srcu_struct *ssp) * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *ssp) +static bool srcu_should_expedite(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; @@ -1097,6 +1278,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) unsigned long tlast; check_init_srcu_struct(ssp); + /* If _lite() readers, don't do unsolicited expediting. */ + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP) + return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_rcu_node(sdp, flags); @@ -1114,18 +1298,18 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp) /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); - tlast = READ_ONCE(ssp->srcu_last_gp_end); + tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end); if (exp_holdoff == 0 || time_in_range_open(t, tlast, tlast + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ - curseq = rcu_seq_current(&ssp->srcu_gp_seq); + curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ - if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed))) + if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed))) return false; /* Grace period in progress, so not idle. */ smp_mb(); /* Order ->srcu_gp_seq with prior access. */ - if (curseq != rcu_seq_current(&ssp->srcu_gp_seq)) + if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)) return false; /* GP # changed, so not idle. */ return true; /* With reasonable probability, idle! */ } @@ -1159,18 +1343,63 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, * sequence number cannot wrap around in the meantime. */ idx = __srcu_read_lock_nmisafe(ssp); - ss_state = smp_load_acquire(&ssp->srcu_size_state); + ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); if (ss_state < SRCU_SIZE_WAIT_CALL) - sdp = per_cpu_ptr(ssp->sda, 0); + sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + /* + * It's crucial to capture the snapshot 's' for acceleration before + * reading the current gp_seq that is used for advancing. This is + * essential because if the acceleration snapshot is taken after a + * failed advancement attempt, there's a risk that a grace period may + * conclude and a new one may start in the interim. If the snapshot is + * captured after this sequence of events, the acceleration snapshot 's' + * could be excessively advanced, leading to acceleration failure. + * In such a scenario, an 'acceleration leak' can occur, where new + * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment. + * Also note that encountering advancing failures is a normal + * occurrence when the grace period for RCU_WAIT_TAIL is in progress. + * + * To see this, consider the following events which occur if + * rcu_seq_snap() were to be called after advance: + * + * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the + * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). + * + * 2) The grace period for RCU_WAIT_TAIL is seen as started but not + * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1. + * + * 3) This value is passed to rcu_segcblist_advance() which can't move + * any segment forward and fails. + * + * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. + * But then the call to rcu_seq_snap() observes the grace period for the + * RCU_WAIT_TAIL segment as completed and the subsequent one for the + * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1) + * so it returns a snapshot of the next grace period, which is X + 12. + * + * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the + * freshly enqueued callback in RCU_NEXT_TAIL can't move to + * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace + * period (gp_num = X + 8). So acceleration fails. + */ + s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); + if (rhp) { + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * Acceleration can never fail because the base current gp_seq + * used for acceleration is <= the value of gp_seq used for + * advancing. This means that RCU_NEXT_TAIL segment will + * always be able to be emptied by the acceleration into the + * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments. + */ + WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s)); + } if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; needgp = true; @@ -1250,8 +1479,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, * read-side critical sections are delimited by srcu_read_lock() and * srcu_read_unlock(), and may be nested. * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. + * The callback will be invoked from process context, but with bh + * disabled. The callback function must therefore be fast and must + * not block. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) @@ -1267,6 +1500,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct rcu_synchronize rcu; + srcu_lock_sync(&ssp->dep_map); + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || @@ -1315,8 +1550,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, - * and then flip the srcu_idx and wait for the count of the other index. + * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero + * at first, and then flip the ->srcu_ctrp and wait for the count of the + * other index. * * Can block; must be called from process context. * @@ -1351,14 +1587,15 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * Implementation of these memory-ordering guarantees is similar to * that of synchronize_rcu(). * - * If SRCU is likely idle, expedite the first request. This semantic - * was provided by Classic SRCU, and is relied upon by its users, so TREE - * SRCU must also provide it. Note that detecting idleness is heuristic - * and subject to both false positives and negatives. + * If SRCU is likely idle as determined by srcu_should_expedite(), + * expedite the first request. This semantic was provided by Classic SRCU, + * and is relied upon by its users, so TREE SRCU must also provide it. + * Note that detecting idleness is heuristic and subject to both false + * positives and negatives. */ void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + if (srcu_should_expedite(ssp) || rcu_gp_is_expedited()) synchronize_srcu_expedited(ssp); else __synchronize_srcu(ssp, true); @@ -1380,7 +1617,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) // Any prior manipulation of SRCU-protected data must happen // before the load from ->srcu_gp_seq. smp_mb(); - return rcu_seq_snap(&ssp->srcu_gp_seq); + return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); @@ -1427,7 +1664,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { - if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie)) + if (cookie != SRCU_GET_STATE_COMPLETED && + !rcu_seq_done_exact(&ssp->srcu_sup->srcu_gp_seq, cookie)) return false; // Ensure that the end of the SRCU grace period happens before // any subsequent code that the caller might execute. @@ -1444,10 +1682,11 @@ static void srcu_barrier_cb(struct rcu_head *rhp) struct srcu_data *sdp; struct srcu_struct *ssp; + rhp->next = rhp; // Mark the callback as having been invoked. sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); ssp = sdp->ssp; - if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) - complete(&ssp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_sup->srcu_barrier_completion); } /* @@ -1461,13 +1700,13 @@ static void srcu_barrier_cb(struct rcu_head *rhp) static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) { spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); + atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, &sdp->srcu_barrier_head)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); + atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt); } spin_unlock_irq_rcu_node(sdp); } @@ -1480,39 +1719,97 @@ void srcu_barrier(struct srcu_struct *ssp) { int cpu; int idx; - unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); + unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq); check_init_srcu_struct(ssp); - mutex_lock(&ssp->srcu_barrier_mutex); - if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) { + mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex); + if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) { smp_mb(); /* Force ordering following return. */ - mutex_unlock(&ssp->srcu_barrier_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); return; /* Someone else did our work for us. */ } - rcu_seq_start(&ssp->srcu_barrier_seq); - init_completion(&ssp->srcu_barrier_completion); + rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq); + init_completion(&ssp->srcu_sup->srcu_barrier_completion); /* Initial count prevents reaching zero until all CBs are posted. */ - atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1); idx = __srcu_read_lock_nmisafe(ssp); - if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) - srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id())); else for_each_possible_cpu(cpu) srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); __srcu_read_unlock_nmisafe(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ - if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) - complete(&ssp->srcu_barrier_completion); - wait_for_completion(&ssp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_sup->srcu_barrier_completion); + wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion); - rcu_seq_end(&ssp->srcu_barrier_seq); - mutex_unlock(&ssp->srcu_barrier_mutex); + rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); +/* Callback for srcu_expedite_current() usage. */ +static void srcu_expedite_current_cb(struct rcu_head *rhp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head); + + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + WARN_ON_ONCE(1); + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_IDLE; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, requeue ourselves as an expedited SRCU callback. + if (needcb) + __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); +} + +/** + * srcu_expedite_current - Expedite the current SRCU grace period + * @ssp: srcu_struct to expedite. + * + * Cause the current SRCU grace period to become expedited. The grace + * period following the current one might also be expedited. If there is + * no current grace period, one might be created. If the current grace + * period is currently sleeping, that sleep will complete before expediting + * will take effect. + */ +void srcu_expedite_current(struct srcu_struct *ssp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp; + + migrate_disable(); + sdp = this_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_REPOST; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, queue an expedited SRCU callback. + if (needcb) + __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); + migrate_enable(); +} +EXPORT_SYMBOL_GPL(srcu_expedite_current); + /** * srcu_batches_completed - return batches completed. * @ssp: srcu_struct on which to report batch completion. @@ -1522,7 +1819,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return READ_ONCE(ssp->srcu_idx); + return READ_ONCE(ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1535,11 +1832,11 @@ static void srcu_advance_state(struct srcu_struct *ssp) { int idx; - mutex_lock(&ssp->srcu_gp_mutex); + mutex_lock(&ssp->srcu_sup->srcu_gp_mutex); /* * Because readers might be delayed for an extended period after - * fetching ->srcu_idx for their index, at any point in time there + * fetching ->srcu_ctrp for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -1547,50 +1844,50 @@ static void srcu_advance_state(struct srcu_struct *ssp) * The load-acquire ensures that we see the accesses performed * by the prior grace period. */ - idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ + idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(ssp); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); - spin_unlock_irq_rcu_node(ssp); - mutex_unlock(&ssp->srcu_gp_mutex); + spin_lock_irq_rcu_node(ssp->srcu_sup); + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; } - idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (idx != SRCU_STATE_IDLE) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* Someone else started the grace period. */ } } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (ssp->srcu_idx & 1); + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 1)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } srcu_flip(ssp); - spin_lock_irq_rcu_node(ssp); - rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); - ssp->srcu_n_exp_nodelay = 0; - spin_unlock_irq_rcu_node(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); + rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_sup->srcu_n_exp_nodelay = 0; + spin_unlock_irq_rcu_node(ssp->srcu_sup); } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 2)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } - ssp->srcu_n_exp_nodelay = 0; + ssp->srcu_sup->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1615,8 +1912,14 @@ static void srcu_invoke_callbacks(struct work_struct *work) ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); + WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * Although this function is theoretically re-entrant, concurrent + * callbacks invocation is disallowed to avoid executing an SRCU barrier + * too early. + */ if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1631,6 +1934,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -1643,11 +1947,10 @@ static void srcu_invoke_callbacks(struct work_struct *work) */ spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); + /* An SRCU barrier or callbacks from previous nesting work pending */ if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1660,20 +1963,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(ssp); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { + spin_lock_irq_rcu_node(ssp->srcu_sup); + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ pushgp = false; } - } else if (!rcu_seq_state(ssp->srcu_gp_seq)) { + } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) { /* Outstanding request and no GP. Start one. */ srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (pushgp) - queue_delayed_work(rcu_gp_wq, &ssp->work, delay); + queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay); } /* @@ -1684,35 +1987,37 @@ static void process_srcu(struct work_struct *work) unsigned long curdelay; unsigned long j; struct srcu_struct *ssp; + struct srcu_usage *sup; - ssp = container_of(work, struct srcu_struct, work.work); + sup = container_of(work, struct srcu_usage, work.work); + ssp = sup->srcu_ssp; srcu_advance_state(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { - WRITE_ONCE(ssp->reschedule_count, 0); + WRITE_ONCE(sup->reschedule_count, 0); } else { j = jiffies; - if (READ_ONCE(ssp->reschedule_jiffies) == j) { - WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); - if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay) + if (READ_ONCE(sup->reschedule_jiffies) == j) { + ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count); + WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1); + if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay) curdelay = 1; } else { - WRITE_ONCE(ssp->reschedule_count, 1); - WRITE_ONCE(ssp->reschedule_jiffies, j); + WRITE_ONCE(sup->reschedule_count, 1); + WRITE_ONCE(sup->reschedule_jiffies, j); } } srcu_reschedule(ssp, curdelay); } -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *ssp, int *flags, +void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { - if (test_type != SRCU_FLAVOR) - return; *flags = 0; - *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq); + *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); @@ -1734,14 +2039,14 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int cpu; int idx; unsigned long s0 = 0, s1 = 0; - int ss_state = READ_ONCE(ssp->srcu_size_state); + int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; - idx = ssp->srcu_idx & 0x1; + idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, + tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state, srcu_size_state_name[ss_state_idx]); if (!ssp->sda) { // Called after cleanup_srcu_struct(), perhaps. @@ -1755,8 +2060,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); - u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); + u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks)); + u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks)); /* * Make sure that a lock is always counted if the corresponding @@ -1764,8 +2069,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); - l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); + l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks)); + l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks)); c0 = l0 - u0; c1 = l1 - u1; @@ -1798,7 +2103,7 @@ early_initcall(srcu_bootup_announce); void __init srcu_init(void) { - struct srcu_struct *ssp; + struct srcu_usage *sup; /* Decide on srcu_struct-size strategy. */ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { @@ -1818,12 +2123,13 @@ void __init srcu_init(void) */ srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, + sup = list_first_entry(&srcu_boot_list, struct srcu_usage, work.work.entry); - list_del_init(&ssp->work.work.entry); - if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) - ssp->srcu_size_state = SRCU_SIZE_ALLOC; - queue_work(rcu_gp_wq, &ssp->work.work); + list_del_init(&sup->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && + sup->srcu_size_state == SRCU_SIZE_SMALL) + sup->srcu_size_state = SRCU_SIZE_ALLOC; + queue_work(rcu_gp_wq, &sup->work.work); } } @@ -1833,13 +2139,15 @@ void __init srcu_init(void) static int srcu_module_coming(struct module *mod) { int i; + struct srcu_struct *ssp; struct srcu_struct **sspp = mod->srcu_struct_ptrs; - int ret; for (i = 0; i < mod->num_srcu_structs; i++) { - ret = init_srcu_struct(*(sspp++)); - if (WARN_ON_ONCE(ret)) - return ret; + ssp = *(sspp++); + ssp->sda = alloc_percpu(struct srcu_data); + if (WARN_ON_ONCE(!ssp->sda)) + return -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; } return 0; } @@ -1848,10 +2156,17 @@ static int srcu_module_coming(struct module *mod) static void srcu_module_going(struct module *mod) { int i; + struct srcu_struct *ssp; struct srcu_struct **sspp = mod->srcu_struct_ptrs; - for (i = 0; i < mod->num_srcu_structs; i++) - cleanup_srcu_struct(*(sspp++)); + for (i = 0; i < mod->num_srcu_structs; i++) { + ssp = *(sspp++); + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) && + !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static)) + cleanup_srcu_struct(ssp); + if (!WARN_ON(srcu_readers_active(ssp))) + free_percpu(ssp->sda); + } } /* Handle one module, either coming or going. */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index e550f97779b8..da60a9947c00 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -24,22 +24,6 @@ void rcu_sync_init(struct rcu_sync *rsp) init_waitqueue_head(&rsp->gp_wait); } -/** - * rcu_sync_enter_start - Force readers onto slow path for multiple updates - * @rsp: Pointer to rcu_sync structure to use for synchronization - * - * Must be called after rcu_sync_init() and before first use. - * - * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() - * pairs turn into NO-OPs. - */ -void rcu_sync_enter_start(struct rcu_sync *rsp) -{ - rsp->gp_count++; - rsp->gp_state = GP_PASSED; -} - - static void rcu_sync_func(struct rcu_head *rhp); static void rcu_sync_call(struct rcu_sync *rsp) @@ -168,9 +152,9 @@ void rcu_sync_enter(struct rcu_sync *rsp) void rcu_sync_exit(struct rcu_sync *rsp) { WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); - WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); spin_lock_irq(&rsp->rss_lock); + WARN_ON_ONCE(rsp->gp_count == 0); if (!--rsp->gp_count) { if (rsp->gp_state == GP_PASSED) { WRITE_ONCE(rsp->gp_state, GP_EXIT); @@ -190,10 +174,10 @@ void rcu_sync_dtor(struct rcu_sync *rsp) { int gp_state; - WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); + WARN_ON_ONCE(rsp->gp_count); if (rsp->gp_state == GP_REPLAY) WRITE_ONCE(rsp->gp_state, GP_EXIT); gp_state = rsp->gp_state; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fe9840d90e96..2dc044fd126e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -25,12 +25,16 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @cblist: Callback list. * @lock: Lock protecting per-CPU callback list. * @rtp_jiffies: Jiffies counter value for statistics. + * @lazy_timer: Timer to unlazify callbacks. + * @urgent_gp: Number of additional non-lazy grace periods. * @rtp_n_lock_retries: Rough lock-contention statistic. * @rtp_work: Work queue for invoking callbacks. * @rtp_irq_work: IRQ work queue for deferred wakeups. * @barrier_q_head: RCU callback for barrier operation. * @rtp_blkd_tasks: List of tasks blocked as readers. + * @rtp_exit_list: List of tasks in the latter portion of do_exit(). * @cpu: CPU number corresponding to this entry. + * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure. * @rtpp: Pointer to the rcu_tasks structure. */ struct rcu_tasks_percpu { @@ -38,11 +42,15 @@ struct rcu_tasks_percpu { raw_spinlock_t __private lock; unsigned long rtp_jiffies; unsigned long rtp_n_lock_retries; + struct timer_list lazy_timer; + unsigned int urgent_gp; struct work_struct rtp_work; struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; struct list_head rtp_blkd_tasks; + struct list_head rtp_exit_list; int cpu; + int index; struct rcu_tasks *rtpp; }; @@ -51,23 +59,26 @@ struct rcu_tasks_percpu { * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone. - * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping. * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. - * @tasks_gp_seq: Number of grace periods completed since boot. + * @tasks_gp_seq: Number of grace periods completed since boot in upper bits. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). * @holdouts_func: This flavor's holdout-list scan function (optional). * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. + * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE). * @rtpcpu: This flavor's rcu_tasks_percpu structure. + * @rtpcp_array: Array of pointers to rcu_tasks_percpu structure of CPUs in cpu_possible_mask. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. @@ -76,6 +87,7 @@ struct rcu_tasks_percpu { * @barrier_q_count: Number of queues being waited on. * @barrier_q_completion: Barrier wait/wakeup mechanism. * @barrier_q_seq: Sequence number for barrier operations. + * @barrier_q_start: Most recent barrier start in jiffies. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ @@ -92,6 +104,7 @@ struct rcu_tasks { unsigned long n_ipis; unsigned long n_ipis_fails; struct task_struct *kthread_ptr; + unsigned long lazy_jiffies; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; pertask_func_t pertask_func; @@ -99,7 +112,9 @@ struct rcu_tasks { holdouts_func_t holdouts_func; postgp_func_t postgp_func; call_rcu_func_t call_func; + unsigned int wait_state; struct rcu_tasks_percpu __percpu *rtpcpu; + struct rcu_tasks_percpu **rtpcp_array; int percpu_enqueue_shift; int percpu_enqueue_lim; int percpu_dequeue_lim; @@ -108,6 +123,7 @@ struct rcu_tasks { atomic_t barrier_q_count; struct completion barrier_q_completion; unsigned long barrier_q_seq; + unsigned long barrier_q_start; char *name; char *kname; }; @@ -126,7 +142,9 @@ static struct rcu_tasks rt_name = \ .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \ .gp_func = gp, \ .call_func = call, \ + .wait_state = TASK_UNINTERRUPTIBLE, \ .rtpcpu = &rt_name ## __percpu, \ + .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \ .name = n, \ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \ .percpu_enqueue_lim = 1, \ @@ -136,8 +154,12 @@ static struct rcu_tasks rt_name = \ .kname = #rt_name, \ } -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); +#ifdef CONFIG_TASKS_RCU + +/* Report delay of scan exiting tasklist in rcu_tasks_postscan(). */ +static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); +static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); +#endif /* Avoid IPIing CPUs early in the grace period. */ #define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) @@ -163,6 +185,10 @@ static int rcu_task_contend_lim __read_mostly = 100; module_param(rcu_task_contend_lim, int, 0444); static int rcu_task_collapse_lim __read_mostly = 10; module_param(rcu_task_collapse_lim, int, 0444); +static int rcu_task_lazy_lim __read_mostly = 32; +module_param(rcu_task_lazy_lim, int, 0444); + +static int rcu_task_cpu_ids; /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 @@ -221,50 +247,89 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) #endif /* #ifndef CONFIG_TINY_RCU */ // Initialize per-CPU callback lists for the specified flavor of -// Tasks RCU. +// Tasks RCU. Do not enqueue callbacks before this function is invoked. static void cblist_init_generic(struct rcu_tasks *rtp) { int cpu; - unsigned long flags; int lim; int shift; + int maxcpu; + int index = 0; - raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; rcu_task_cb_adjust = true; - pr_info("%s: Setting adjustable number of callback queues.\n", __func__); } else if (rcu_task_enqueue_lim == 0) { rcu_task_enqueue_lim = 1; } lim = rcu_task_enqueue_lim; - if (lim > nr_cpu_ids) - lim = nr_cpu_ids; - shift = ilog2(nr_cpu_ids / lim); - if (((nr_cpu_ids - 1) >> shift) >= lim) - shift++; - WRITE_ONCE(rtp->percpu_enqueue_shift, shift); - WRITE_ONCE(rtp->percpu_dequeue_lim, lim); - smp_store_release(&rtp->percpu_enqueue_lim, lim); + rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL); + BUG_ON(!rtp->rtpcp_array); + for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); WARN_ON_ONCE(!rtpcp); if (cpu) raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. if (rcu_segcblist_empty(&rtpcp->cblist)) rcu_segcblist_init(&rtpcp->cblist); INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; + rtpcp->index = index; + rtp->rtpcp_array[index] = rtpcp; + index++; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); - raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head; + maxcpu = cpu; } - raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); - pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); + + rcu_task_cpu_ids = maxcpu + 1; + if (lim > rcu_task_cpu_ids) + lim = rcu_task_cpu_ids; + shift = ilog2(rcu_task_cpu_ids / lim); + if (((rcu_task_cpu_ids - 1) >> shift) >= lim) + shift++; + WRITE_ONCE(rtp->percpu_enqueue_shift, shift); + WRITE_ONCE(rtp->percpu_dequeue_lim, lim); + smp_store_release(&rtp->percpu_enqueue_lim, lim); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n", + rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), + rcu_task_cb_adjust, rcu_task_cpu_ids); +} + +// Compute wakeup time for lazy callback timer. +static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp) +{ + return jiffies + rtp->lazy_jiffies; +} + +// Timer handler that unlazifies lazy callbacks. +static void call_rcu_tasks_generic_timer(struct timer_list *tlp) +{ + unsigned long flags; + bool needwake = false; + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = timer_container_of(rtpcp, tlp, + lazy_timer); + + rtp = rtpcp->rtpp; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) { + if (!rtpcp->urgent_gp) + rtpcp->urgent_gp = 1; + needwake = true; + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + } + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (needwake) + rcuwait_wake_up(&rtp->cbs_wait); } // IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). @@ -283,6 +348,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, { int chosen_cpu; unsigned long flags; + bool havekthread = smp_load_acquire(&rtp->kthread_ptr); int ideal_cpu; unsigned long j; bool needadjust = false; @@ -295,6 +361,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rcu_read_lock(); ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids); rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. @@ -304,23 +371,30 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rtpcp->rtp_n_lock_retries = 0; } if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && - READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) + READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids) needadjust = true; // Defer adjustment to avoid deadlock. } - if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { - raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. - cblist_init_generic(rtp); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + // Queuing callbacks before initialization not yet supported. + if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist))) + rcu_segcblist_init(&rtpcp->cblist); + needwake = (func == wakeme_after_rcu) || + (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim); + if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) { + if (rtp->lazy_jiffies) + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + else + needwake = rcu_segcblist_empty(&rtpcp->cblist); } - needwake = rcu_segcblist_empty(&rtpcp->cblist); + if (needwake) + rtpcp->urgent_gp = 3; rcu_segcblist_enqueue(&rtpcp->cblist, rhp); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); - if (rtp->percpu_enqueue_lim != nr_cpu_ids) { + if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) { WRITE_ONCE(rtp->percpu_enqueue_shift, 0); - WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); - smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); + WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids); + smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); @@ -337,6 +411,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) struct rcu_tasks *rtp; struct rcu_tasks_percpu *rtpcp; + rhp->next = rhp; // Mark the callback as having been invoked. rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head); rtp = rtpcp->rtpp; if (atomic_dec_and_test(&rtp->barrier_q_count)) @@ -345,7 +420,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) // Wait for all in-flight callbacks for the specified RCU Tasks flavor. // Operates in a manner similar to rcu_barrier(). -static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) +static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp) { int cpu; unsigned long flags; @@ -358,6 +433,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) mutex_unlock(&rtp->barrier_q_mutex); return; } + rtp->barrier_q_start = jiffies; rcu_seq_start(&rtp->barrier_q_seq); init_completion(&rtp->barrier_q_completion); atomic_set(&rtp->barrier_q_count, 2); @@ -383,13 +459,18 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) { int cpu; + int dequeue_limit; unsigned long flags; + bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq); long n; long ncbs = 0; long ncbsnz = 0; int needgpcb = 0; - for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { + dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); + for (cpu = 0; cpu < dequeue_limit; cpu++) { + if (!cpu_possible(cpu)) + continue; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -405,9 +486,14 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) } rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); - if (rcu_segcblist_pend_cbs(&rtpcp->cblist)) + if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) { + if (rtp->lazy_jiffies) + rtpcp->urgent_gp--; needgpcb |= 0x3; - if (!rcu_segcblist_empty(&rtpcp->cblist)) + } else if (rcu_segcblist_empty(&rtpcp->cblist)) { + rtpcp->urgent_gp = 0; + } + if (rcu_segcblist_ready_cbs(&rtpcp->cblist)) needgpcb |= 0x1; raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } @@ -422,24 +508,28 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids)); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); + gpdone = false; pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); } - if (rcu_task_cb_adjust && !ncbsnz && - poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) { + if (rcu_task_cb_adjust && !ncbsnz && gpdone) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) { WRITE_ONCE(rtp->percpu_dequeue_lim, 1); pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); } - for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + if (rtp->percpu_dequeue_lim == 1) { + for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); - WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); + } } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); } @@ -450,27 +540,32 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) // Advance callbacks and invoke any that are ready. static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp) { - int cpu; - int cpunext; + int cpuwq; unsigned long flags; int len; + int index; struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); struct rcu_tasks_percpu *rtpcp_next; - cpu = rtpcp->cpu; - cpunext = cpu * 2 + 1; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); - cpunext++; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + index = rtpcp->index * 2 + 1; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work); + index++; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work); + } + } } } - if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) + if (rcu_segcblist_empty(&rtpcp->cblist)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); @@ -478,6 +573,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); len = rcl.len; for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -510,10 +606,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) if (unlikely(midboot)) { needgpcb = 0x2; } else { + mutex_unlock(&rtp->tasks_gp_mutex); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); rcuwait_wait_event(&rtp->cbs_wait, (needgpcb = rcu_tasks_need_gpcb(rtp)), TASK_IDLE); + mutex_lock(&rtp->tasks_gp_mutex); } if (needgpcb & 0x2) { @@ -534,11 +632,19 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) // RCU-tasks kthread that detects grace periods and invokes callbacks. static int __noreturn rcu_tasks_kthread(void *arg) { + int cpu; struct rcu_tasks *rtp = arg; + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0); + rtpcp->urgent_gp = 1; + } + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_TYPE_RCU); - WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! + smp_store_release(&rtp->kthread_ptr, current); // Let GPs start! /* * Each pass through the following loop makes one check for @@ -560,12 +666,13 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) { /* Complain if the scheduler has not started. */ - WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); + if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_%s() called too soon", rtp->name)) + return; // If the grace-period kthread is running, use it. if (READ_ONCE(rtp->kthread_ptr)) { - wait_rcu_gp(rtp->call_func); + wait_rcu_gp_state(rtp->wait_state, rtp->call_func); return; } rcu_tasks_one_gp(rtp, true); @@ -611,24 +718,28 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -#endif /* #ifndef CONFIG_TINY_RCU */ -#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { int cpu; bool havecbs = false; + bool haveurgent = false; + bool haveurgentcbs = false; for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); - if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) havecbs = true; + if (data_race(rtpcp->urgent_gp)) + haveurgent = true; + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp)) + haveurgentcbs = true; + if (havecbs && haveurgent && haveurgentcbs) break; - } } - pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), @@ -636,8 +747,58 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[havecbs], + ".u"[haveurgent], + ".U"[haveurgentcbs], + rtp->lazy_jiffies, s); } + +/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. */ +static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt, + char *tf, char *tst) +{ + cpumask_var_t cm; + int cpu; + bool gotcb = false; + unsigned long j = jiffies; + + pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n", + tt, tf, tst, data_race(rtp->tasks_gp_seq), + j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies), + data_race(rtp->gp_state), tasks_gp_state_getname(rtp)); + pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n", + data_race(rtp->percpu_enqueue_shift), + data_race(rtp->percpu_enqueue_lim), + data_race(rtp->percpu_dequeue_lim), + data_race(rtp->percpu_dequeue_gpseq)); + (void)zalloc_cpumask_var(&cm, GFP_KERNEL); + pr_alert("\tCallback counts:"); + for_each_possible_cpu(cpu) { + long n; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head)) + cpumask_set_cpu(cpu, cm); + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (!n) + continue; + pr_cont(" %d:%ld", cpu, n); + gotcb = true; + } + if (gotcb) + pr_cont(".\n"); + else + pr_cont(" (none).\n"); + pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ", + data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start), + atomic_read(&rtp->barrier_q_count)); + if (cpumask_available(cm) && !cpumask_empty(cm)) + pr_cont(" %*pbl.\n", cpumask_pr_args(cm)); + else + pr_cont("(none).\n"); + free_cpumask_var(cm); +} + #endif // #ifndef CONFIG_TINY_RCU static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -768,10 +929,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // number of voluntary context switches, and add that task to the // holdout list. // rcu_tasks_postscan(): -// Invoke synchronize_srcu() to ensure that all tasks that were -// in the process of exiting (and which thus might not know to -// synchronize with this RCU Tasks grace period) have completed -// exiting. +// Gather per-CPU lists of tasks in do_exit() to ensure that all +// tasks that were in the process of exiting (and which thus might +// not know to synchronize with this RCU Tasks grace period) have +// completed exiting. The synchronize_rcu() in rcu_tasks_postgp() +// will take care of any tasks stuck in the non-preemptible region +// of do_exit() following its call to exit_tasks_rcu_finish(). // check_all_holdout_tasks(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the @@ -784,8 +947,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // with interrupts disabled. // // For each exiting task, the exit_tasks_rcu_start() and -// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU -// read-side critical sections waited for by rcu_tasks_postscan(). +// exit_tasks_rcu_finish() functions add and remove, respectively, the +// current task to a per-CPU list of tasks that rcu_tasks_postscan() must +// wait on. This is necessary because rcu_tasks_postscan() must wait on +// tasks that have already been removed from the global list of tasks. // // Pre-grace-period update-side code is ordered before the grace // via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code @@ -812,10 +977,45 @@ static void rcu_tasks_pregp_step(struct list_head *hop) synchronize_rcu(); } +/* Check for quiescent states since the pregp's synchronize_rcu() */ +static bool rcu_tasks_is_holdout(struct task_struct *t) +{ + int cpu; + + /* Has the task been seen voluntarily sleeping? */ + if (!READ_ONCE(t->on_rq)) + return false; + + /* + * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but + * since it is a spurious state (it will transition into the + * traditional blocked state or get woken up without outside + * dependencies), not considering it such should only affect timing. + * + * Be conservative for now and not include it. + */ + + /* + * Idle tasks (or idle injection) within the idle loop are RCU-tasks + * quiescent states. But CPU boot code performed by the idle task + * isn't a quiescent state. + */ + if (is_idle_task(t)) + return false; + + cpu = task_cpu(t); + + /* Idle tasks on offline CPUs are RCU-tasks quiescent states. */ + if (t == idle_task(cpu) && !rcu_cpu_online(cpu)) + return false; + + return true; +} + /* Per-task initial processing. */ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) { - if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) { + if (t != current && rcu_tasks_is_holdout(t)) { get_task_struct(t); t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); WRITE_ONCE(t->rcu_tasks_holdout, true); @@ -823,17 +1023,71 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } } +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); + /* Processing between scanning taskslist and draining the holdout list. */ static void rcu_tasks_postscan(struct list_head *hop) { + int cpu; + int rtsi = READ_ONCE(rcu_task_stall_info); + + if (!IS_ENABLED(CONFIG_TINY_RCU)) { + tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi; + add_timer(&tasks_rcu_exit_srcu_stall_timer); + } + /* - * Wait for tasks that are in the process of exiting. This - * does only part of the job, ensuring that all tasks that were - * previously exiting reach the point where they have disabled - * preemption, allowing the later synchronize_rcu() to finish - * the job. + * Exiting tasks may escape the tasklist scan. Those are vulnerable + * until their final schedule() with TASK_DEAD state. To cope with + * this, divide the fragile exit path part in two intersecting + * read side critical sections: + * + * 1) A task_struct list addition before calling exit_notify(), + * which may remove the task from the tasklist, with the + * removal after the final preempt_disable() call in do_exit(). + * + * 2) An _RCU_ read side starting with the final preempt_disable() + * call in do_exit() and ending with the final call to schedule() + * with TASK_DEAD state. + * + * This handles the part 1). And postgp will handle part 2) with a + * call to synchronize_rcu(). */ - synchronize_srcu(&tasks_rcu_exit_srcu); + + for_each_possible_cpu(cpu) { + unsigned long j = jiffies + 1; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu); + struct task_struct *t; + struct task_struct *t1; + struct list_head tmp; + + raw_spin_lock_irq_rcu_node(rtpcp); + list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) { + if (list_empty(&t->rcu_tasks_holdout_list)) + rcu_tasks_pertask(t, hop); + + // RT kernels need frequent pauses, otherwise + // pause at least once per pair of jiffies. + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j)) + continue; + + // Keep our place in the list while pausing. + // Nothing else traverses this list, so adding a + // bare list_head is OK. + list_add(&tmp, &t->rcu_tasks_exit_list); + raw_spin_unlock_irq_rcu_node(rtpcp); + cond_resched(); // For CONFIG_PREEMPT=n kernels + raw_spin_lock_irq_rcu_node(rtpcp); + t1 = list_entry(tmp.next, struct task_struct, rcu_tasks_exit_list); + list_del(&tmp); + j = jiffies + 1; + } + raw_spin_unlock_irq_rcu_node(rtpcp); + } + + if (!IS_ENABLED(CONFIG_TINY_RCU)) + timer_delete_sync(&tasks_rcu_exit_srcu_stall_timer); } /* See if tasks are still holding out, complain if so. */ @@ -844,9 +1098,9 @@ static void check_holdout_task(struct task_struct *t, if (!READ_ONCE(t->rcu_tasks_holdout) || t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || + !rcu_tasks_is_holdout(t) || (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { + !is_idle_task(t) && READ_ONCE(t->rcu_tasks_idle_cpu) >= 0)) { WRITE_ONCE(t->rcu_tasks_holdout, false); list_del_init(&t->rcu_tasks_holdout_list); put_task_struct(t); @@ -864,7 +1118,7 @@ static void check_holdout_task(struct task_struct *t, t, ".I"[is_idle_task(t)], "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); + data_race(t->rcu_tasks_idle_cpu), cpu); sched_show_task(t); } @@ -898,13 +1152,27 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) * * In addition, this synchronize_rcu() waits for exiting tasks * to complete their final preempt_disable() region of execution, - * cleaning up after the synchronize_srcu() above. + * enforcing the whole region before tasklist removal until + * the final schedule() with TASK_DEAD state to be an RCU TASKS + * read side critical section. */ synchronize_rcu(); } -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); +static void tasks_rcu_exit_srcu_stall(struct timer_list *unused) +{ +#ifndef CONFIG_TINY_RCU + int rtsi; + + rtsi = READ_ONCE(rcu_task_stall_info); + pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n", + __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq, + tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies); + pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n"); + tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi; + add_timer(&tasks_rcu_exit_srcu_stall_timer); +#endif // #ifndef CONFIG_TINY_RCU +} /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period @@ -966,16 +1234,21 @@ void rcu_barrier_tasks(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); +static int rcu_tasks_lazy_ms = -1; +module_param(rcu_tasks_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_kthread(void) { - cblist_init_generic(&rcu_tasks); rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = HZ / 10; + if (rcu_tasks_lazy_ms >= 0) + rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms); rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; rcu_tasks.postscan_func = rcu_tasks_postscan; rcu_tasks.holdouts_func = check_all_holdout_tasks; rcu_tasks.postgp_func = rcu_tasks_postgp; + rcu_tasks.wait_state = TASK_IDLE; rcu_spawn_tasks_kthread_generic(&rcu_tasks); return 0; } @@ -986,24 +1259,70 @@ void show_rcu_tasks_classic_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); + +void rcu_tasks_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +struct task_struct *get_rcu_tasks_gp_kthread(void) { - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); + return rcu_tasks.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); + +void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks.tasks_gp_seq); } +EXPORT_SYMBOL_GPL(rcu_tasks_get_gp_data); -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +/* + * Protect against tasklist scan blind spot while the task is exiting and + * may be removed from the tasklist. Do this by adding the task to yet + * another list. + * + * Note that the task will remove itself from this list, so there is no + * need for get_task_struct(), except in the case where rcu_tasks_pertask() + * adds it to the holdout list, in which case rcu_tasks_pertask() supplies + * the needed get_task_struct(). + */ +void exit_tasks_rcu_start(void) { + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; struct task_struct *t = current; + WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list)); preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu); + t->rcu_tasks_exit_cpu = smp_processor_id(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + WARN_ON_ONCE(!rtpcp->rtp_exit_list.next); + list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); preempt_enable(); +} + +/* + * Remove the task from the "yet another list" because do_exit() is now + * non-preemptible, allowing synchronize_rcu() to wait beyond this point. + */ +void exit_tasks_rcu_finish(void) +{ + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t = current; + + WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list)); + rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_del_init(&t->rcu_tasks_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + exit_tasks_rcu_finish_trace(t); } @@ -1016,13 +1335,12 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } //////////////////////////////////////////////////////////////////////// // -// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of -// passing an empty function to schedule_on_each_cpu(). This approach -// provides an asynchronous call_rcu_tasks_rude() API and batching of -// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API. -// This invokes schedule_on_each_cpu() in order to send IPIs far and wide -// and induces otherwise unnecessary context switches on all online CPUs, -// whether idle or not. +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's +// trick of passing an empty function to schedule_on_each_cpu(). +// This approach provides batching of concurrent calls to the synchronous +// synchronize_rcu_tasks_rude() API. This invokes schedule_on_each_cpu() +// in order to send IPIs far and wide and induces otherwise unnecessary +// context switches on all online CPUs, whether idle or not. // // Callback handling is provided by the rcu_tasks_kthread() function. // @@ -1036,18 +1354,15 @@ static void rcu_tasks_be_rude(struct work_struct *work) // Wait for one rude RCU-tasks grace period. static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) { - if (num_online_cpus() <= 1) - return; // Fastpath for only one CPU. - rtp->n_ipis += cpumask_weight(cpu_online_mask); schedule_on_each_cpu(rcu_tasks_be_rude); } -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, "RCU Tasks Rude"); -/** +/* * call_rcu_tasks_rude() - Queue a callback rude task-based grace period * @rhp: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period @@ -1064,12 +1379,14 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. + * + * This is no longer exported, and is instead reserved for use by + * synchronize_rcu_tasks_rude(). */ -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) { call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); } -EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); /** * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period @@ -1091,25 +1408,13 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); */ void synchronize_rcu_tasks_rude(void) { - synchronize_rcu_tasks_generic(&rcu_tasks_rude); + if (!IS_ENABLED(CONFIG_ARCH_WANTS_NO_INSTR) || IS_ENABLED(CONFIG_FORCE_TASKS_RUDE_RCU)) + synchronize_rcu_tasks_generic(&rcu_tasks_rude); } EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); -/** - * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks_rude(void) -{ - rcu_barrier_tasks_generic(&rcu_tasks_rude); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); - static int __init rcu_spawn_tasks_rude_kthread(void) { - cblist_init_generic(&rcu_tasks_rude); rcu_tasks_rude.gp_sleep = HZ / 10; rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; @@ -1121,7 +1426,27 @@ void show_rcu_tasks_rude_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); + +void rcu_tasks_rude_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) + +struct task_struct *get_rcu_tasks_rude_gp_kthread(void) +{ + return rcu_tasks_rude.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread); + +void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks_rude.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data); + #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// @@ -1221,19 +1546,12 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v) /* * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for * the four-byte operand-size restriction of some platforms. + * * Returns the old value, which is often ignored. */ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) { - union rcu_special ret; - union rcu_special trs_old = READ_ONCE(t->trc_reader_special); - union rcu_special trs_new = trs_old; - - if (trs_old.b.need_qs != old) - return trs_old.b.need_qs; - trs_new.b.need_qs = new; - ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s); - return ret.b.need_qs; + return cmpxchg(&t->trc_reader_special.b.need_qs, old, new); } EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); @@ -1362,14 +1680,14 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) // However, we cannot safely change its state. n_heavy_reader_attempts++; // Check for "running" idle tasks on offline CPUs. - if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting)) return -EINVAL; // No quiescent state, do it the hard way. n_heavy_reader_updates++; nesting = 0; } else { // The task is not running, so C-language access is safe. nesting = t->trc_reader_nesting; - WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t)); + WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t)))); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl) n_heavy_reader_ofl_updates++; } @@ -1496,6 +1814,16 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) // allow safe access to the hop list. for_each_online_cpu(cpu) { rcu_read_lock(); + // Note that cpu_curr_snapshot() picks up the target + // CPU's current task while its runqueue is locked with + // an smp_mb__after_spinlock(). This ensures that either + // the grace-period kthread will see that task's read-side + // critical section or the task will see the updater's pre-GP + // accesses. The trailing smp_mb() in cpu_curr_snapshot() + // does not currently play a role other than simplify + // that function's ordering semantics. If these simplified + // ordering semantics continue to be redundant, that smp_mb() + // might be removed. t = cpu_curr_snapshot(cpu); if (rcu_tasks_trace_pertask_prep(t, true)) trc_add_holdout(t, hop); @@ -1727,9 +2055,11 @@ void rcu_barrier_tasks_trace(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); +int rcu_tasks_trace_lazy_ms = -1; +module_param(rcu_tasks_trace_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_trace_kthread(void) { - cblist_init_generic(&rcu_tasks_trace); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { rcu_tasks_trace.gp_sleep = HZ / 10; rcu_tasks_trace.init_fract = HZ / 10; @@ -1741,6 +2071,8 @@ static int __init rcu_spawn_tasks_trace_kthread(void) if (rcu_tasks_trace.init_fract <= 0) rcu_tasks_trace.init_fract = 1; } + if (rcu_tasks_trace_lazy_ms >= 0) + rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms); rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; @@ -1754,7 +2086,7 @@ void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "N%lu h:%lu/%lu/%lu", + snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu", data_race(n_trc_holdouts), data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), @@ -1762,8 +2094,27 @@ void show_rcu_tasks_trace_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); + +void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) +struct task_struct *get_rcu_tasks_trace_gp_kthread(void) +{ + return rcu_tasks_trace.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); + +void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data); + #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ @@ -1792,17 +2143,13 @@ static struct rcu_tasks_test_desc tests[] = { .notrun = IS_ENABLED(CONFIG_TASKS_RCU), }, { - .name = "call_rcu_tasks_rude()", - /* If not defined, the test is skipped. */ - .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU), - }, - { .name = "call_rcu_tasks_trace()", /* If not defined, the test is skipped. */ .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU) } }; +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void test_rcu_tasks_callback(struct rcu_head *rhp) { struct rcu_tasks_test_desc *rttd = @@ -1812,28 +2159,27 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) rttd->notrun = false; } +#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void rcu_tasks_initiate_self_tests(void) { - unsigned long j = jiffies; - - pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU - tests[0].runstart = j; + pr_info("Running RCU Tasks wait API self tests\n"); + tests[0].runstart = jiffies; synchronize_rcu_tasks(); call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_RUDE_RCU - tests[1].runstart = j; + pr_info("Running RCU Tasks Rude wait API self tests\n"); synchronize_rcu_tasks_rude(); - call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU - tests[2].runstart = j; + pr_info("Running RCU Tasks Trace wait API self tests\n"); + tests[1].runstart = jiffies; synchronize_rcu_tasks_trace(); - call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); + call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback); #endif } @@ -1893,7 +2239,25 @@ late_initcall(rcu_tasks_verify_schedule_work); static void rcu_tasks_initiate_self_tests(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ -void __init rcu_init_tasks_generic(void) +void __init tasks_cblist_init_generic(void) +{ + lockdep_assert_irqs_disabled(); + WARN_ON(num_online_cpus() > 1); + +#ifdef CONFIG_TASKS_RCU + cblist_init_generic(&rcu_tasks); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + cblist_init_generic(&rcu_tasks_rude); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + cblist_init_generic(&rcu_tasks_trace); +#endif +} + +static int __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU rcu_spawn_tasks_kthread(); @@ -1909,7 +2273,10 @@ void __init rcu_init_tasks_generic(void) // Run the self-tests. rcu_tasks_initiate_self_tests(); + + return 0; } +core_initcall(rcu_init_tasks_generic); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 72913ce21258..585cade21010 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -70,12 +70,10 @@ void rcu_qs(void) */ void rcu_sched_clock_irq(int user) { - if (user) { + if (user) rcu_qs(); - } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) + set_need_resched_current(); } /* @@ -85,18 +83,12 @@ void rcu_sched_clock_irq(int user) static inline bool rcu_reclaim_tiny(struct rcu_head *head) { rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kvfree_rcu_offset(offset)) { - trace_rcu_invoke_kvfree_callback("", head, offset); - kvfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } trace_rcu_invoke_callback("", head); f = head->func; + debug_rcu_head_callback(head); WRITE_ONCE(head->func, (rcu_callback_t)0L); f(head); rcu_lock_release(&rcu_callback_map); @@ -104,7 +96,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) } /* Invoke the RCU callbacks whose grace period has elapsed. */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(void) { struct rcu_head *next, *list; unsigned long flags; @@ -129,9 +121,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - local_bh_disable(); rcu_reclaim_tiny(list); - local_bh_enable(); list = next; } } @@ -154,14 +144,12 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); + preempt_disable(); WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); + preempt_enable(); } EXPORT_SYMBOL_GPL(synchronize_rcu); -static void tiny_rcu_leak_callback(struct rcu_head *rhp) -{ -} - /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -177,9 +165,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); mem_dump_obj(head); } - - if (!__is_kvfree_rcu_offset((unsigned long)head->func)) - WRITE_ONCE(head->func, tiny_rcu_leak_callback); return; } @@ -245,22 +230,23 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) +unsigned long long rcutorture_gather_gp_seqs(void) { - if (head) { - void *ptr = (void *) head - (unsigned long) func; - - kasan_record_aux_stack_noalloc(ptr); - } + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL; +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); - __kvfree_call_rcu(head, func); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + snprintf(cp, len, "g%04llx", seqs & 0xffffULL); } -EXPORT_SYMBOL_GPL(kvfree_call_rcu); +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); rcu_early_boot_tests(); + tasks_cblist_init_generic(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cf34a961821a..293bbd9ac3f4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -31,6 +31,7 @@ #include <linux/bitops.h> #include <linux/export.h> #include <linux/completion.h> +#include <linux/kmemleak.h> #include <linux/moduleparam.h> #include <linux/panic.h> #include <linux/panic_notifier.h> @@ -74,13 +75,20 @@ #define MODULE_PARAM_PREFIX "rcutree." /* Data structures. */ +static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, -#ifdef CONFIG_RCU_NOCB_CPU - .cblist.flags = SEGCBLIST_RCU_CORE, -#endif }; + +int rcu_get_gpwrap_count(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return READ_ONCE(rdp->gpwrap_count); +} +EXPORT_SYMBOL_GPL(rcu_get_gpwrap_count); + static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, @@ -92,6 +100,12 @@ static struct rcu_state rcu_state = { .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, + .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, + rcu_sr_normal_gp_cleanup_work), + .srs_cleanups_pending = ATOMIC_INIT(0), +#ifdef CONFIG_RCU_NOCB_CPU + .nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex), +#endif }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -144,14 +158,14 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); -static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); +static bool rcu_rdp_cpu_online(struct rcu_data *rdp); +static bool rcu_init_invoked(void); +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); /* * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" @@ -169,6 +183,9 @@ static int gp_init_delay; module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +static int nohz_full_patience_delay; +module_param(nohz_full_patience_delay, int, 0444); +static int nohz_full_patience_delay_jiffies; // Add delay to rcu_read_unlock() for strict grace periods. static int rcu_unlock_delay; @@ -176,26 +193,6 @@ static int rcu_unlock_delay; module_param(rcu_unlock_delay, int, 0444); #endif -/* - * This rcu parameter is runtime-read-only. It reflects - * a minimum allowed number of objects which can be cached - * per-CPU. Object size is equal to one page. This value - * can be changed at boot time. - */ -static int rcu_min_cached_objs = 5; -module_param(rcu_min_cached_objs, int, 0444); - -// A page shrinker can ask for pages to be freed to make them -// available for other parts of the system. This usually happens -// under low memory conditions, and in that case we should also -// defer page-cache filling for a short time period. -// -// The default value is 5 seconds, which is long enough to reduce -// interference with the shrinker while it asks other systems to -// drain their caches. -static int rcu_delay_page_cache_fill_msec = 5000; -module_param(rcu_delay_page_cache_fill_msec, int, 0444); - /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -215,27 +212,6 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */ /* - * Compute the mask of online CPUs for the specified rcu_node structure. - * This will not be stable unless the rcu_node structure's ->lock is - * held, but the bit corresponding to the current CPU will be stable - * in most contexts. - */ -static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) -{ - return READ_ONCE(rnp->qsmaskinitnext); -} - -/* - * Is the CPU corresponding to the specified rcu_data structure online - * from RCU's perspective? This perspective is given by that structure's - * ->qsmaskinitnext field rather than by the global cpu_online_mask. - */ -static bool rcu_rdp_cpu_online(struct rcu_data *rdp) -{ - return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); -} - -/* * Return true if an RCU grace period is in progress. The READ_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. @@ -258,76 +234,113 @@ static long rcu_get_n_cbs_cpu(int cpu) return 0; } +/** + * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing + * + * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU. + * This is a special-purpose function to be used in the softirq + * infrastructure and perhaps the occasional long-running softirq + * handler. + * + * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is + * equivalent to momentarily completely enabling preemption. For + * example, given this code:: + * + * local_bh_disable(); + * do_something(); + * rcu_softirq_qs(); // A + * do_something_else(); + * local_bh_enable(); // B + * + * A call to synchronize_rcu() that began concurrently with the + * call to do_something() would be guaranteed to wait only until + * execution reached statement A. Without that rcu_softirq_qs(), + * that same synchronize_rcu() would instead be guaranteed to wait + * until execution reached statement B. + */ void rcu_softirq_qs(void) { + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal rcu_softirq_qs() in RCU read-side critical section"); rcu_qs(); rcu_preempt_deferred_qs(current); rcu_tasks_qs(current, false); } /* - * Reset the current CPU's ->dynticks counter to indicate that the + * Reset the current CPU's RCU_WATCHING counter to indicate that the * newly onlined CPU is no longer in an extended quiescent state. * This will either leave the counter unchanged, or increment it * to the next non-quiescent value. * * The non-atomic test/increment sequence works because the upper bits - * of the ->dynticks counter are manipulated only by the corresponding CPU, + * of the ->state variable are manipulated only by the corresponding CPU, * or when the corresponding CPU is offline. */ -static void rcu_dynticks_eqs_online(void) +static void rcu_watching_online(void) { - if (ct_dynticks() & RCU_DYNTICKS_IDX) + if (ct_rcu_watching() & CT_RCU_WATCHING) return; - ct_state_inc(RCU_DYNTICKS_IDX); -} - -/* - * Snapshot the ->dynticks counter with full ordering so as to allow - * stable comparison of this counter with past and future snapshots. - */ -static int rcu_dynticks_snap(int cpu) -{ - smp_mb(); // Fundamental RCU ordering guarantee. - return ct_dynticks_cpu_acquire(cpu); + ct_state_inc(CT_RCU_WATCHING); } /* - * Return true if the snapshot returned from rcu_dynticks_snap() + * Return true if the snapshot returned from ct_rcu_watching() * indicates that RCU is in an extended quiescent state. */ -static bool rcu_dynticks_in_eqs(int snap) +static bool rcu_watching_snap_in_eqs(int snap) { - return !(snap & RCU_DYNTICKS_IDX); + return !(snap & CT_RCU_WATCHING); } -/* - * Return true if the CPU corresponding to the specified rcu_data - * structure has spent some time in an extended quiescent state since - * rcu_dynticks_snap() returned the specified snapshot. +/** + * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU + * since the specified @snap? + * + * @rdp: The rcu_data corresponding to the CPU for which to check EQS. + * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS. + * + * Returns true if the CPU corresponding to @rdp has spent some time in an + * extended quiescent state since @snap. Note that this doesn't check if it + * /still/ is in an EQS, just that it went through one since @snap. + * + * This is meant to be used in a loop waiting for a CPU to go through an EQS. */ -static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) +static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap) { - return snap != rcu_dynticks_snap(rdp->cpu); + /* + * The first failing snapshot is already ordered against the accesses + * performed by the remote CPU after it exits idle. + * + * The second snapshot therefore only needs to order against accesses + * performed by the remote CPU prior to entering idle and therefore can + * rely solely on acquire semantics. + */ + if (WARN_ON_ONCE(rcu_watching_snap_in_eqs(snap))) + return true; + + return snap != ct_rcu_watching_cpu_acquire(rdp->cpu); } /* * Return true if the referenced integer is zero while the specified * CPU remains within a single extended quiescent state. */ -bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +bool rcu_watching_zero_in_eqs(int cpu, int *vp) { int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX; - smp_rmb(); // Order ->dynticks and *vp reads. + snap = ct_rcu_watching_cpu(cpu) & ~CT_RCU_WATCHING; + smp_rmb(); // Order CT state and *vp reads. if (READ_ONCE(*vp)) return false; // Non-zero, so report failure; - smp_rmb(); // Order *vp read and ->dynticks re-read. + smp_rmb(); // Order *vp read and CT state re-read. // If still in the same extended quiescent state, we are good! - return snap == ct_dynticks_cpu(cpu); + return snap == ct_rcu_watching_cpu(cpu); } /* @@ -341,17 +354,17 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) * * The caller must have disabled interrupts and must not be idle. */ -notrace void rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_eqs(void) { int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - seq = ct_state_inc(2 * RCU_DYNTICKS_IDX); + seq = ct_state_inc(2 * CT_RCU_WATCHING); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX)); + WARN_ON_ONCE(!(seq & CT_RCU_WATCHING)); rcu_preempt_deferred_qs(current); } -EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); +EXPORT_SYMBOL_GPL(rcu_momentary_eqs); /** * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle @@ -363,7 +376,7 @@ EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); */ static int rcu_is_cpu_rrupt_from_idle(void) { - long nesting; + long nmi_nesting = ct_nmi_nesting(); /* * Usually called from the tick; but also used from smp_function_call() @@ -373,23 +386,30 @@ static int rcu_is_cpu_rrupt_from_idle(void) lockdep_assert_irqs_disabled(); /* Check for counter underflows */ - RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0, - "RCU dynticks_nesting counter underflow!"); - RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0, - "RCU dynticks_nmi_nesting counter underflow/zero!"); - - /* Are we at first interrupt nesting level? */ - nesting = ct_dynticks_nmi_nesting(); - if (nesting > 1) + RCU_LOCKDEP_WARN(ct_nesting() < 0, + "RCU nesting counter underflow!"); + + /* Non-idle interrupt or nested idle interrupt */ + if (nmi_nesting > 1) return false; /* - * If we're not in an interrupt, we must be in the idle task! + * Non nested idle interrupt (interrupting section where RCU + * wasn't watching). */ - WARN_ON_ONCE(!nesting && !is_idle_task(current)); + if (nmi_nesting == 1) + return true; + + /* Not in an interrupt */ + if (!nmi_nesting) { + RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current), + "RCU nmi_nesting counter not in idle task!"); + return !rcu_is_watching_curr_cpu(); + } + + RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!"); - /* Does CPU appear to be idle from an RCU standpoint? */ - return ct_dynticks_nesting() == 0; + return false; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) @@ -526,21 +546,34 @@ static struct rcu_node *rcu_get_root(void) /* * Send along grace-period-related data for rcutorture diagnostics. */ -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gp_seq) +void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { - switch (test_type) { - case RCU_FLAVOR: - *flags = READ_ONCE(rcu_state.gp_flags); - *gp_seq = rcu_seq_current(&rcu_state.gp_seq); - break; - default: - break; - } + *flags = READ_ONCE(rcu_state.gp_flags); + *gp_seq = rcu_seq_current(&rcu_state.gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) +/* Gather grace-period sequence numbers for rcutorture diagnostics. */ +unsigned long long rcutorture_gather_gp_seqs(void) +{ + return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL); +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +/* Format grace-period sequence numbers for rcutorture diagnostics. */ +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + unsigned int egp = (seqs >> 16) & 0xffffffULL; + unsigned int ggp = (seqs >> 40) & 0xffffULL; + unsigned int pgp = seqs & 0xffffULL; + + snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); + +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on * IRQ tail once IRQs get re-enabled on userspace/guest resume. @@ -569,7 +602,7 @@ noinstr void rcu_irq_work_resched(void) if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) return; - if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) + if (IS_ENABLED(CONFIG_VIRT_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) return; instrumentation_begin(); @@ -578,7 +611,7 @@ noinstr void rcu_irq_work_resched(void) } instrumentation_end(); } -#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */ +#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) */ #ifdef CONFIG_PROVE_RCU /** @@ -588,12 +621,12 @@ void rcu_irq_exit_check_preempt(void) { lockdep_assert_irqs_disabled(); - RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0, - "RCU dynticks_nesting counter underflow/zero!"); - RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() != - DYNTICK_IRQ_NONIDLE, - "Bad RCU dynticks_nmi_nesting counter\n"); - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + RCU_LOCKDEP_WARN(ct_nesting() <= 0, + "RCU nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(ct_nmi_nesting() != + CT_NESTING_IRQ_NONIDLE, + "Bad RCU nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), "RCU in extended quiescent state!"); } #endif /* #ifdef CONFIG_PROVE_RCU */ @@ -633,7 +666,7 @@ void __rcu_irq_enter_check_tick(void) if (in_nmi()) return; - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); if (!tick_nohz_full_cpu(rdp->cpu) || @@ -651,7 +684,7 @@ void __rcu_irq_enter_check_tick(void) // prevents self-deadlock. So we can safely recheck under the lock. // Note that the nohz_full state currently cannot change. raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { // A nohz_full CPU is in the kernel and RCU needs a // quiescent state. Turn on the tick! WRITE_ONCE(rdp->rcu_forced_tick, true); @@ -659,6 +692,7 @@ void __rcu_irq_enter_check_tick(void) } raw_spin_unlock_rcu_node(rdp->mynode); } +NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick); #endif /* CONFIG_NO_HZ_FULL */ /* @@ -695,12 +729,16 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } /** - * rcu_is_watching - see if RCU thinks that the current CPU is not idle + * rcu_is_watching - RCU read-side critical sections permitted on current CPU? + * + * Return @true if RCU is watching the running CPU and @false otherwise. + * An @true return means that this CPU can safely enter RCU read-side + * critical sections. * - * Return true if RCU is watching the running CPU, which means that this - * CPU can safely enter RCU read-side critical sections. In other words, - * if the current CPU is not in its idle loop or is in an interrupt or - * NMI handler, return true. + * Although calls to rcu_is_watching() from most parts of the kernel + * will return @true, there are important exceptions. For example, if the + * current CPU is deep within its idle loop, in kernel entry/exit code, + * or offline, rcu_is_watching() will return @false. * * Make notrace because it can be called by the internal functions of * ftrace, and making this notrace removes unnecessary recursion calls. @@ -710,7 +748,7 @@ notrace bool rcu_is_watching(void) bool ret; preempt_disable_notrace(); - ret = !rcu_dynticks_curr_cpu_in_eqs(); + ret = rcu_is_watching_curr_cpu(); preempt_enable_notrace(); return ret; } @@ -734,45 +772,24 @@ void rcu_request_urgent_qs_task(struct task_struct *t) smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true); } -#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) +static unsigned long seq_gpwrap_lag = ULONG_MAX / 4; -/* - * Is the current CPU online as far as RCU is concerned? - * - * Disable preemption to avoid false positives that could otherwise - * happen due to the current CPU number being sampled, this task being - * preempted, its old CPU being taken offline, resuming on some other CPU, - * then determining that its old CPU is now offline. - * - * Disable checking if in an NMI handler because we cannot safely - * report errors from NMI handlers anyway. In addition, it is OK to use - * RCU on an offline processor during initial boot, hence the check for - * rcu_scheduler_fully_active. +/** + * rcu_set_gpwrap_lag - Set RCU GP sequence overflow lag value. + * @lag_gps: Set overflow lag to this many grace period worth of counters + * which is used by rcutorture to quickly force a gpwrap situation. + * @lag_gps = 0 means we reset it back to the boot-time value. */ -bool rcu_lockdep_current_cpu_online(void) +void rcu_set_gpwrap_lag(unsigned long lag_gps) { - struct rcu_data *rdp; - bool ret = false; + unsigned long lag_seq_count; - if (in_nmi() || !rcu_scheduler_fully_active) - return true; - preempt_disable_notrace(); - rdp = this_cpu_ptr(&rcu_data); - /* - * Strictly, we care here about the case where the current CPU is - * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask - * not being up to date. So arch_spin_is_locked() might have a - * false positive if it's held by some *other* CPU, but that's - * OK because that just means a false *negative* on the warning. - */ - if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) - ret = true; - preempt_enable_notrace(); - return ret; + lag_seq_count = (lag_gps == 0) + ? ULONG_MAX / 4 + : lag_gps << RCU_SEQ_CTR_SHIFT; + WRITE_ONCE(seq_gpwrap_lag, lag_seq_count); } -EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); - -#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ +EXPORT_SYMBOL_GPL(rcu_set_gpwrap_lag); /* * When trying to report a quiescent state on behalf of some other CPU, @@ -784,22 +801,35 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { raw_lockdep_assert_held_rcu_node(rnp); - if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, - rnp->gp_seq)) + if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + seq_gpwrap_lag, + rnp->gp_seq)) { WRITE_ONCE(rdp->gpwrap, true); + WRITE_ONCE(rdp->gpwrap_count, READ_ONCE(rdp->gpwrap_count) + 1); + } if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; } /* - * Snapshot the specified CPU's dynticks counter so that we can later + * Snapshot the specified CPU's RCU_WATCHING counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp) +static int rcu_watching_snap_save(struct rcu_data *rdp) { - rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu); - if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { + /* + * Full ordering between remote CPU's post idle accesses and updater's + * accesses prior to current GP (and also the started GP sequence number) + * is enforced by rcu_seq_start() implicit barrier and even further by + * smp_mb__after_unlock_lock() barriers chained all the way throughout the + * rnp locking tree since rcu_gp_init() and up to the current leaf rnp + * locking. + * + * Ordering between remote CPU's pre idle accesses and post grace period + * updater's accesses is enforced by the below acquire semantic. + */ + rdp->watching_snap = ct_rcu_watching_cpu_acquire(rdp->cpu); + if (rcu_watching_snap_in_eqs(rdp->watching_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; @@ -807,15 +837,24 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) return 0; } +#ifndef arch_irq_stat_cpu +#define arch_irq_stat_cpu(cpu) 0 +#endif + /* - * Return true if the specified CPU has passed through a quiescent - * state by virtue of being in or having passed through an dynticks - * idle state since the last call to dyntick_save_progress_counter() - * for this same CPU, or by virtue of having been offline. + * Returns positive if the specified CPU has passed through a quiescent state + * by virtue of being in or having passed through an dynticks idle state since + * the last call to rcu_watching_snap_save() for this same CPU, or by + * virtue of having been offline. + * + * Returns negative if the specified CPU needs a force resched. + * + * Returns zero otherwise. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +static int rcu_watching_snap_recheck(struct rcu_data *rdp) { unsigned long jtsq; + int ret = 0; struct rcu_node *rnp = rdp->mynode; /* @@ -826,7 +865,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->watching_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rnp, rdp); return 1; @@ -861,8 +900,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)], - (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, - (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state); return 1; /* Break things loose after complaining. */ } @@ -901,8 +940,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || rcu_state.cbovld)) { WRITE_ONCE(rdp->rcu_urgent_qs, true); - resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); + ret = -1; } /* @@ -915,8 +954,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (time_after(jiffies, rcu_state.jiffies_resched)) { if (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq)) { - resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); + ret = -1; } if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && @@ -925,9 +964,27 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); } + + if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) { + int cpu = rdp->cpu; + struct rcu_snap_record *rsrp; + struct kernel_cpustat *kcsp; + + kcsp = &kcpustat_cpu(cpu); + + rsrp = &rdp->snap_record; + rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); + rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); + rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu); + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu); + rsrp->nr_csw = nr_context_switches_cpu(cpu); + rsrp->jiffies = jiffies; + rsrp->gp_seq = rdp->gp_seq; + } } - return 0; + return ret; } /* Trace-event wrapper function for trace_rcu_future_grace_period. */ @@ -1225,7 +1282,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ rdp->core_needs_qs = false; @@ -1239,7 +1296,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1254,7 +1311,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); - if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1296,7 +1353,7 @@ EXPORT_SYMBOL_GPL(rcu_gp_slow_register); /* Unregister a counter, with NULL for not caring which. */ void rcu_gp_slow_unregister(atomic_t *rgssp) { - WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL); WRITE_ONCE(rcu_gp_slow_suppress, NULL); } @@ -1350,13 +1407,6 @@ static void rcu_strict_gp_boundary(void *unused) invoke_rcu_core(); } -// Has rcu_init() been invoked? This is used (for example) to determine -// whether spinlocks may be acquired safely. -static bool rcu_init_invoked(void) -{ - return !!rcu_state.n_online_cpus; -} - // Make the polled API aware of the beginning of a grace period. static void rcu_poll_gp_seq_start(unsigned long *snap) { @@ -1428,6 +1478,326 @@ static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) } /* + * There is a single llist, which is used for handling + * synchronize_rcu() users' enqueued rcu_synchronize nodes. + * Within this llist, there are two tail pointers: + * + * wait tail: Tracks the set of nodes, which need to + * wait for the current GP to complete. + * done tail: Tracks the set of nodes, for which grace + * period has elapsed. These nodes processing + * will be done as part of the cleanup work + * execution by a kworker. + * + * At every grace period init, a new wait node is added + * to the llist. This wait node is used as wait tail + * for this new grace period. Given that there are a fixed + * number of wait nodes, if all wait nodes are in use + * (which can happen when kworker callback processing + * is delayed) and additional grace period is requested. + * This means, a system is slow in processing callbacks. + * + * TODO: If a slow processing is detected, a first node + * in the llist should be used as a wait-tail for this + * grace period, therefore users which should wait due + * to a slow process are handled by _this_ grace period + * and not next. + * + * Below is an illustration of how the done and wait + * tail pointers move from one set of rcu_synchronize nodes + * to the other, as grace periods start and finish and + * nodes are processed by kworker. + * + * + * a. Initial llist callbacks list: + * + * +----------+ +--------+ +-------+ + * | | | | | | + * | head |---------> | cb2 |--------->| cb1 | + * | | | | | | + * +----------+ +--------+ +-------+ + * + * + * + * b. New GP1 Start: + * + * WAIT TAIL + * | + * | + * v + * +----------+ +--------+ +--------+ +-------+ + * | | | | | | | | + * | head ------> wait |------> cb2 |------> | cb1 | + * | | | head1 | | | | | + * +----------+ +--------+ +--------+ +-------+ + * + * + * + * c. GP completion: + * + * WAIT_TAIL == DONE_TAIL + * + * DONE TAIL + * | + * | + * v + * +----------+ +--------+ +--------+ +-------+ + * | | | | | | | | + * | head ------> wait |------> cb2 |------> | cb1 | + * | | | head1 | | | | | + * +----------+ +--------+ +--------+ +-------+ + * + * + * + * d. New callbacks and GP2 start: + * + * WAIT TAIL DONE TAIL + * | | + * | | + * v v + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * | | | | | | | | | | | | | | + * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 | + * | | | head2| | | | | |head1| | | | | + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * + * + * + * e. GP2 completion: + * + * WAIT_TAIL == DONE_TAIL + * DONE TAIL + * | + * | + * v + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * | | | | | | | | | | | | | | + * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 | + * | | | head2| | | | | |head1| | | | | + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * + * + * While the llist state transitions from d to e, a kworker + * can start executing rcu_sr_normal_gp_cleanup_work() and + * can observe either the old done tail (@c) or the new + * done tail (@e). So, done tail updates and reads need + * to use the rel-acq semantics. If the concurrent kworker + * observes the old done tail, the newly queued work + * execution will process the updated done tail. If the + * concurrent kworker observes the new done tail, then + * the newly queued work will skip processing the done + * tail, as workqueue semantics guarantees that the new + * work is executed only after the previous one completes. + * + * f. kworker callbacks processing complete: + * + * + * DONE TAIL + * | + * | + * v + * +----------+ +--------+ + * | | | | + * | head ------> wait | + * | | | head2 | + * +----------+ +--------+ + * + */ +static bool rcu_sr_is_wait_head(struct llist_node *node) +{ + return &(rcu_state.srs_wait_nodes)[0].node <= node && + node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node; +} + +static struct llist_node *rcu_sr_get_wait_head(void) +{ + struct sr_wait_node *sr_wn; + int i; + + for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) { + sr_wn = &(rcu_state.srs_wait_nodes)[i]; + + if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1)) + return &sr_wn->node; + } + + return NULL; +} + +static void rcu_sr_put_wait_head(struct llist_node *node) +{ + struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node); + + atomic_set_release(&sr_wn->inuse, 0); +} + +/* Enable rcu_normal_wake_from_gp automatically on small systems. */ +#define WAKE_FROM_GP_CPU_THRESHOLD 16 + +static int rcu_normal_wake_from_gp = -1; +module_param(rcu_normal_wake_from_gp, int, 0644); +static struct workqueue_struct *sync_wq; + +static void rcu_sr_normal_complete(struct llist_node *node) +{ + struct rcu_synchronize *rs = container_of( + (struct rcu_head *) node, struct rcu_synchronize, head); + + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && + !poll_state_synchronize_rcu_full(&rs->oldstate), + "A full grace period is not passed yet!\n"); + + /* Finally. */ + complete(&rs->completion); +} + +static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) +{ + struct llist_node *done, *rcu, *next, *head; + + /* + * This work execution can potentially execute + * while a new done tail is being updated by + * grace period kthread in rcu_sr_normal_gp_cleanup(). + * So, read and updates of done tail need to + * follow acq-rel semantics. + * + * Given that wq semantics guarantees that a single work + * cannot execute concurrently by multiple kworkers, + * the done tail list manipulations are protected here. + */ + done = smp_load_acquire(&rcu_state.srs_done_tail); + if (WARN_ON_ONCE(!done)) + return; + + WARN_ON_ONCE(!rcu_sr_is_wait_head(done)); + head = done->next; + done->next = NULL; + + /* + * The dummy node, which is pointed to by the + * done tail which is acq-read above is not removed + * here. This allows lockless additions of new + * rcu_synchronize nodes in rcu_sr_normal_add_req(), + * while the cleanup work executes. The dummy + * nodes is removed, in next round of cleanup + * work execution. + */ + llist_for_each_safe(rcu, next, head) { + if (!rcu_sr_is_wait_head(rcu)) { + rcu_sr_normal_complete(rcu); + continue; + } + + rcu_sr_put_wait_head(rcu); + } + + /* Order list manipulations with atomic access. */ + atomic_dec_return_release(&rcu_state.srs_cleanups_pending); +} + +/* + * Helper function for rcu_gp_cleanup(). + */ +static void rcu_sr_normal_gp_cleanup(void) +{ + struct llist_node *wait_tail, *next = NULL, *rcu = NULL; + int done = 0; + + wait_tail = rcu_state.srs_wait_tail; + if (wait_tail == NULL) + return; + + rcu_state.srs_wait_tail = NULL; + ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail); + WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail)); + + /* + * Process (a) and (d) cases. See an illustration. + */ + llist_for_each_safe(rcu, next, wait_tail->next) { + if (rcu_sr_is_wait_head(rcu)) + break; + + rcu_sr_normal_complete(rcu); + // It can be last, update a next on this step. + wait_tail->next = next; + + if (++done == SR_MAX_USERS_WAKE_FROM_GP) + break; + } + + /* + * Fast path, no more users to process except putting the second last + * wait head if no inflight-workers. If there are in-flight workers, + * they will remove the last wait head. + * + * Note that the ACQUIRE orders atomic access with list manipulation. + */ + if (wait_tail->next && wait_tail->next->next == NULL && + rcu_sr_is_wait_head(wait_tail->next) && + !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) { + rcu_sr_put_wait_head(wait_tail->next); + wait_tail->next = NULL; + } + + /* Concurrent sr_normal_gp_cleanup work might observe this update. */ + ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail); + smp_store_release(&rcu_state.srs_done_tail, wait_tail); + + /* + * We schedule a work in order to perform a final processing + * of outstanding users(if still left) and releasing wait-heads + * added by rcu_sr_normal_gp_init() call. + */ + if (wait_tail->next) { + atomic_inc(&rcu_state.srs_cleanups_pending); + if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work)) + atomic_dec(&rcu_state.srs_cleanups_pending); + } +} + +/* + * Helper function for rcu_gp_init(). + */ +static bool rcu_sr_normal_gp_init(void) +{ + struct llist_node *first; + struct llist_node *wait_head; + bool start_new_poll = false; + + first = READ_ONCE(rcu_state.srs_next.first); + if (!first || rcu_sr_is_wait_head(first)) + return start_new_poll; + + wait_head = rcu_sr_get_wait_head(); + if (!wait_head) { + // Kick another GP to retry. + start_new_poll = true; + return start_new_poll; + } + + /* Inject a wait-dummy-node. */ + llist_add(wait_head, &rcu_state.srs_next); + + /* + * A waiting list of rcu_synchronize nodes should be empty on + * this step, since a GP-kthread, rcu_gp_init() -> gp_cleanup(), + * rolls it over. If not, it is a BUG, warn a user. + */ + WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL); + rcu_state.srs_wait_tail = wait_head; + ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail); + + return start_new_poll; +} + +static void rcu_sr_normal_add_req(struct rcu_synchronize *rs) +{ + llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next); +} + +/* * Initialize a new grace period. Return false if no grace period required. */ static noinline_for_stack bool rcu_gp_init(void) @@ -1437,10 +1807,12 @@ static noinline_for_stack bool rcu_gp_init(void) unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(); + bool start_new_poll; + unsigned long old_gp_seq; WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); - if (!READ_ONCE(rcu_state.gp_flags)) { + if (!rcu_state.gp_flags) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq_rcu_node(rnp); return false; @@ -1458,14 +1830,49 @@ static noinline_for_stack bool rcu_gp_init(void) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(); + /* + * A new wait segment must be started before gp_seq advanced, so + * that previous gp waiters won't observe the new gp_seq. + */ + start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ + old_gp_seq = rcu_state.gp_seq; + /* + * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug + * scan below. Otherwise we risk a race where a newly onlining CPU could + * be missed by the current grace period, potentially leading to + * use-after-free errors. For a detailed explanation of this race, see + * Documentation/RCU/Design/Requirements/Requirements.rst in the + * "Hotplug CPU" section. + * + * Also note that the root rnp's gp_seq is kept separate from, and lags, + * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on + * Single-node systems for more details (in Data-Structures.rst). + */ rcu_seq_start(&rcu_state.gp_seq); + /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && + rcu_seq_done_exact(&old_gp_seq, rcu_seq_snap(&rcu_state.gp_seq))); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* + * The "start_new_poll" is set to true, only when this GP is not able + * to handle anything and there are outstanding users. It happens when + * the rcu_sr_normal_gp_init() function was not able to insert a dummy + * separator to the llist, because there were no left any dummy-nodes. + * + * Number of dummy-nodes is fixed, it could be that we are run out of + * them, if so we start a new pool request to repeat a try. It is rare + * and it means that a system is doing a slow processing of callbacks. + */ + if (start_new_poll) + (void) start_poll_synchronize_rcu(); + + /* * Apply per-leaf buffered online and offline operations to * the rcu_node tree. Note that this new grace period need not * wait for subsequent online CPUs, and that RCU hooks in the CPU @@ -1477,7 +1884,11 @@ static noinline_for_stack bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { - local_irq_save(flags); + local_irq_disable(); + /* + * Serialize with CPU offline. See Requirements.rst > Hotplug CPU > + * Concurrent Quiescent State Reporting for Offline CPUs. + */ arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1485,7 +1896,7 @@ static noinline_for_stack bool rcu_gp_init(void) /* Nothing to do on this leaf rcu_node structure. */ raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); continue; } @@ -1522,7 +1933,7 @@ static noinline_for_stack bool rcu_gp_init(void) raw_spin_unlock_rcu_node(rnp); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + local_irq_enable(); } rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ @@ -1552,7 +1963,12 @@ static noinline_for_stack bool rcu_gp_init(void) trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - /* Quiescent states for tasks on any now-offline CPUs. */ + /* + * Quiescent states for tasks on any now-offline CPUs. Since we + * released the ofl and rnp lock before this loop, CPUs might + * have gone offline and we have to report QS on their behalf. + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting. + */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) @@ -1599,22 +2015,33 @@ static bool rcu_gp_fqs_check_wake(int *gfp) */ static void rcu_gp_fqs(bool first_time) { + int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall); struct rcu_node *rnp = rcu_get_root(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1); + + WARN_ON_ONCE(nr_fqs > 3); + /* Only countdown nr_fqs for stall purposes if jiffies moves. */ + if (nr_fqs) { + if (nr_fqs == 1) { + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); + } + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs); + } + if (first_time) { /* Collect dyntick-idle snapshots. */ - force_qs_rnp(dyntick_save_progress_counter); + force_qs_rnp(rcu_watching_snap_save); } else { /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rcu_implicit_dynticks_qs); + force_qs_rnp(rcu_watching_snap_recheck); } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq_rcu_node(rnp); - WRITE_ONCE(rcu_state.gp_flags, - READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq_rcu_node(rnp); } } @@ -1818,6 +2245,9 @@ static noinline void rcu_gp_cleanup(void) } raw_spin_unlock_irq_rcu_node(rnp); + // Make synchronize_rcu() users aware of the end of old grace period. + rcu_sr_normal_gp_cleanup(); + // If strict, make all CPUs aware of the end of the old grace period. if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) on_each_cpu(rcu_strict_gp_boundary, NULL, 0); @@ -1875,8 +2305,7 @@ static void rcu_report_qs_rsp(unsigned long flags) { raw_lockdep_assert_held_rcu_node(rcu_get_root()); WARN_ON_ONCE(!rcu_gp_in_progress()); - WRITE_ONCE(rcu_state.gp_flags, - READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags); rcu_gp_kthread_wake(); } @@ -2003,8 +2432,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake = false; - bool needacc = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2035,26 +2462,17 @@ rcu_report_qs_rdp(struct rcu_data *rdp) * NOCB kthreads have their own way to deal with that... */ if (!rcu_rdp_is_offloaded(rdp)) { - needwake = rcu_accelerate_cbs(rnp, rdp); - } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { /* - * ...but NOCB kthreads may miss or delay callbacks acceleration - * if in the middle of a (de-)offloading process. + * The current GP has not yet ended, so it + * should not be possible for rcu_accelerate_cbs() + * to return true. So complain, but don't awaken. */ - needacc = true; + WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp)); } rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ - if (needwake) - rcu_gp_kthread_wake(); - - if (needacc) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); - } } } @@ -2091,90 +2509,16 @@ rcu_check_quiescent_state(struct rcu_data *rdp) rcu_report_qs_rdp(rdp); } -/* - * Near the end of the offline process. Trace the fact that this CPU - * is going offline. - */ -int rcutree_dying_cpu(unsigned int cpu) +/* Return true if callback-invocation time limit exceeded. */ +static bool rcu_do_batch_check_time(long count, long tlimit, + bool jlimit_check, unsigned long jlimit) { - bool blkd; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), - blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); - return 0; -} - -/* - * All CPUs for the specified rcu_node structure have gone offline, - * and all tasks that were preempted within an RCU read-side critical - * section while running on one of those CPUs have since exited their RCU - * read-side critical section. Some other CPU is reporting this fact with - * the specified rcu_node structure's ->lock held and interrupts disabled. - * This function therefore goes up the tree of rcu_node structures, - * clearing the corresponding bits in the ->qsmaskinit fields. Note that - * the leaf rcu_node structure's ->qsmaskinit field has already been - * updated. - * - * This function does check that the specified rcu_node structure has - * all CPUs offline and no blocked tasks, so it is OK to invoke it - * prematurely. That said, invoking it after the fact will cost you - * a needless lock acquisition. So once it has done its work, don't - * invoke it again. - */ -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) -{ - long mask; - struct rcu_node *rnp = rnp_leaf; - - raw_lockdep_assert_held_rcu_node(rnp_leaf); - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - WARN_ON_ONCE(rnp_leaf->qsmaskinit) || - WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) - return; - for (;;) { - mask = rnp->grpmask; - rnp = rnp->parent; - if (!rnp) - break; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rnp->qsmaskinit &= ~mask; - /* Between grace periods, so better already be zero! */ - WARN_ON_ONCE(rnp->qsmask); - if (rnp->qsmaskinit) { - raw_spin_unlock_rcu_node(rnp); - /* irqs remain disabled. */ - return; - } - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - } -} - -/* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup. - * There can only be one CPU hotplug operation at a time, so no need for - * explicit locking. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); - /* Adjust any no-longer-needed kthreads. */ - rcu_boost_kthread_setaffinity(rnp, -1); - // Stop-machine done, so allow nohz_full to disable tick. - tick_dep_clear(TICK_DEP_BIT_RCU); - return 0; + // Invoke local_clock() only once per 32 consecutive callbacks. + return unlikely(tlimit) && + (!likely(count & 31) || + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) && + jlimit_check && time_after(jiffies, jlimit))) && + local_clock() >= tlimit; } /* @@ -2183,13 +2527,17 @@ int rcutree_dead_cpu(unsigned int cpu) */ static void rcu_do_batch(struct rcu_data *rdp) { + long bl; + long count = 0; int div; bool __maybe_unused empty; unsigned long flags; - struct rcu_head *rhp; + unsigned long jlimit; + bool jlimit_check = false; + long pending; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count = 0; - long pending, tlimit = 0; + struct rcu_head *rhp; + long tlimit = 0; /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { @@ -2206,18 +2554,28 @@ static void rcu_do_batch(struct rcu_data *rdp) * Extract the list of ready callbacks, disabling IRQs to prevent * races with call_rcu() from interrupt handlers. Leave the * callback counts, as rcu_barrier() needs to be conservative. + * + * Callbacks execution is fully ordered against preceding grace period + * completion (materialized by rnp->gp_seq update) thanks to the + * smp_mb__after_unlock_lock() upon node locking required for callbacks + * advancing. In NOCB mode this ordering is then further relayed through + * the nocb locking that protects both callbacks advancing and extraction. */ rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); - pending = rcu_segcblist_n_cbs(&rdp->cblist); + pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL); div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (in_serving_softirq() && unlikely(bl > 100)) { + if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) && + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) { + const long npj = NSEC_PER_SEC / HZ; long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; tlimit = local_clock() + rrn; + jlimit = jiffies + (rrn + npj + 1) / npj; + jlimit_check = true; } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); @@ -2242,6 +2600,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_invoke_callback(rcu_state.name, rhp); f = rhp->func; + debug_rcu_head_callback(rhp); WRITE_ONCE(rhp->func, (rcu_callback_t)0L); f(rhp); @@ -2257,19 +2616,23 @@ static void rcu_do_batch(struct rcu_data *rdp) * Make sure we don't spend too much time here and deprive other * softirq vectors of CPU cycles. */ - if (unlikely(tlimit)) { - /* only call local_clock() every 32 callbacks */ - if (likely((count & 31) || local_clock() < tlimit)) - continue; - /* Exceeded the time limit, so leave. */ + if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) break; - } } else { + // In rcuc/rcuoc context, so no worries about + // depriving other softirq vectors of CPU cycles. local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); lockdep_assert_irqs_enabled(); local_bh_disable(); + // But rcuc kthreads can delay quiescent-state + // reporting, so check time limits for them. + if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING && + rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) { + rdp->rcu_cpu_has_work = 1; + break; + } } } @@ -2333,10 +2696,8 @@ void rcu_sched_clock_irq(int user) /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { /* Idle and userspace execution already are quiescent states. */ - if (!rcu_is_cpu_rrupt_from_idle() && !user) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + if (!rcu_is_cpu_rrupt_from_idle() && !user) + set_need_resched_current(); __this_cpu_write(rcu_data.rcu_urgent_qs, false); } rcu_flavor_sched_clock_irq(user); @@ -2360,15 +2721,15 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { int cpu; unsigned long flags; - unsigned long mask; - struct rcu_data *rdp; struct rcu_node *rnp; rcu_state.cbovld = rcu_state.cbovldnext; rcu_state.cbovldnext = false; rcu_for_each_leaf_node(rnp) { + unsigned long mask = 0; + unsigned long rsmask = 0; + cond_resched_tasks_rcu_qs(); - mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { @@ -2386,11 +2747,17 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) continue; } for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + struct rcu_data *rdp; + int ret; + rdp = per_cpu_ptr(&rcu_data, cpu); - if (f(rdp)) { + ret = f(rdp); + if (ret > 0) { mask |= rdp->grpmask; rcu_disable_urgency_upon_qs(rdp); } + if (ret < 0) + rsmask |= rdp->grpmask; } if (mask != 0) { /* Idle/offline CPUs, report (releases rnp->lock). */ @@ -2399,6 +2766,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + + for_each_leaf_node_cpu_mask(rnp, cpu, rsmask) + resched_cpu(cpu); } } @@ -2413,6 +2783,8 @@ void rcu_force_quiescent_state(void) struct rcu_node *rnp; struct rcu_node *rnp_old = NULL; + if (!rcu_gp_in_progress()) + return; /* Funnel through hierarchy to reduce memory contention. */ rnp = raw_cpu_read(rcu_data.mynode); for (; rnp != NULL; rnp = rnp->parent) { @@ -2433,8 +2805,7 @@ void rcu_force_quiescent_state(void) raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } - WRITE_ONCE(rcu_state.gp_flags, - READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(); } @@ -2451,27 +2822,8 @@ static void strict_work_handler(struct work_struct *work) /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { - unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - /* - * On RT rcu_core() can be preempted when IRQs aren't disabled. - * Therefore this function can race with concurrent NOCB (de-)offloading - * on this CPU and the below condition must be considered volatile. - * However if we race with: - * - * _ Offloading: In the worst case we accelerate or process callbacks - * concurrently with NOCB kthreads. We are guaranteed to - * call rcu_nocb_lock() if that happens. - * - * _ Deoffloading: In the worst case we miss callbacks acceleration or - * processing. This is fine because the early stage - * of deoffloading invokes rcu_core() after setting - * SEGCBLIST_RCU_CORE. So we guarantee that we'll process - * what could have been dismissed without the need to wait - * for the next rcu_pending() check in the next jiffy. - */ - const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2482,8 +2834,8 @@ static __latent_entropy void rcu_core(void) if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { - set_tsk_need_resched(current); - set_preempt_need_resched(); + guard(irqsave)(); + set_need_resched_current(); } /* Update RCU state based on any recent quiescent states. */ @@ -2491,17 +2843,16 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) { - rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { + guard(irqsave)(); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && + if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) { rcu_do_batch(rdp); /* Re-invoke RCU core processing if there are callbacks remaining. */ @@ -2518,7 +2869,7 @@ static __latent_entropy void rcu_core(void) queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } -static void rcu_core_si(struct softirq_action *h) +static void rcu_core_si(void) { rcu_core(); } @@ -2588,12 +2939,12 @@ static void rcu_cpu_kthread(unsigned int cpu) *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); work = *workp; - *workp = 0; + WRITE_ONCE(*workp, 0); local_irq_enable(); if (work) rcu_core(); local_bh_enable(); - if (*workp == 0) { + if (!READ_ONCE(*workp)) { trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); *statusp = RCU_KTHREAD_WAITING; return; @@ -2632,12 +2983,21 @@ static int __init rcu_spawn_core_kthreads(void) return 0; } +static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) +{ + rcu_segcblist_enqueue(&rdp->cblist, head); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); +} + /* * Handle any core-RCU processing required by a call_rcu() invocation. */ -static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, - unsigned long flags) +static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags) { + rcutree_enqueue(rdp, head, func); /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2727,16 +3087,20 @@ static void check_cb_ovld(struct rcu_data *rdp) } static void -__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) { static atomic_t doublefrees; unsigned long flags; + bool lazy; struct rcu_data *rdp; - bool was_alldone; /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + /* Avoid NULL dereference if callback is NULL. */ + if (WARN_ON_ONCE(!func)) + return; + if (debug_rcu_head_queue(head)) { /* * Probable double call_rcu(), so leak the callback. @@ -2752,9 +3116,13 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) } head->func = func; head->next = NULL; - kasan_record_aux_stack_noalloc(head); + kasan_record_aux_stack(head); + local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); + RCU_LOCKDEP_WARN(!rcu_rdp_cpu_online(rdp), "Callback enqueued on offline CPU!"); + + lazy = lazy_in && !rcu_async_should_hurry(); /* Add the callback to our list. */ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { @@ -2768,30 +3136,18 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) } check_cb_ovld(rdp); - if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) - return; // Enqueued onto ->nocb_bypass, so just leave. - // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. - rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); - - trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); - /* Go handle any RCU core processing required. */ - if (unlikely(rcu_rdp_is_offloaded(rdp))) { - __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ - } else { - __call_rcu_core(rdp, head, flags); - local_irq_restore(flags); - } + if (unlikely(rcu_rdp_is_offloaded(rdp))) + call_rcu_nocb(rdp, head, func, flags, lazy); + else + call_rcu_core(rdp, head, func, flags); + local_irq_restore(flags); } #ifdef CONFIG_RCU_LAZY +static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF); +module_param(enable_rcu_lazy, bool, 0444); + /** * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and * flush all lazy callbacks (including the new one) to the main ->cblist while @@ -2805,7 +3161,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) * critical sections have completed. * * Use this API instead of call_rcu() if you don't want the callback to be - * invoked after very long periods of time, which can happen on systems without + * delayed for very long periods of time, which can happen on systems without * memory pressure and on systems which are lightly loaded or mostly idle. * This function will cause callbacks to be invoked sooner than later at the * expense of extra power. Other than that, this function is identical to, and @@ -2814,9 +3170,11 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) */ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, false); + __call_rcu_common(head, func, false); } EXPORT_SYMBOL_GPL(call_rcu_hurry); +#else +#define enable_rcu_lazy false #endif /** @@ -2834,6 +3192,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * might well execute concurrently with RCU read-side critical sections * that started after call_rcu() was invoked. * + * It is perfectly legal to repost an RCU callback, potentially with + * a different callback function, from within its callback function. + * The specified function will be invoked after another full grace period + * has elapsed. This use case is similar in form to the common practice + * of reposting a timer from within its own handler. + * * RCU read-side critical sections are delimited by rcu_read_lock() * and rcu_read_unlock(), and may be nested. In addition, but only in * v5.0 and later, regions of code across which interrupts, preemption, @@ -2862,633 +3226,74 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * * Implementation of these memory-ordering guarantees is described here: * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + * + * Specific to call_rcu() (as opposed to the other call_rcu*() functions), + * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many + * seconds before starting the grace period needed by the corresponding + * callback. This delay can significantly improve energy-efficiency + * on low-utilization battery-powered devices. To avoid this delay, + * in latency-sensitive kernel code, use call_rcu_hurry(). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); + __call_rcu_common(head, func, enable_rcu_lazy); } EXPORT_SYMBOL_GPL(call_rcu); -/* Maximum number of jiffies to wait before draining a batch. */ -#define KFREE_DRAIN_JIFFIES (5 * HZ) -#define KFREE_N_BATCHES 2 -#define FREE_N_CHANNELS 2 - -/** - * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers - * @nr_records: Number of active pointers in the array - * @next: Next bulk object in the block chain - * @records: Array of the kvfree_rcu() pointers - */ -struct kvfree_rcu_bulk_data { - unsigned long nr_records; - struct kvfree_rcu_bulk_data *next; - void *records[]; -}; - /* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kvfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KVFREE_BULK_MAX_ENTR \ - ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) - -/** - * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period - * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period - * @krcp: Pointer to @kfree_rcu_cpu structure - */ - -struct kfree_rcu_cpu_work { - struct rcu_work rcu_work; - struct rcu_head *head_free; - struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; - struct kfree_rcu_cpu *krcp; -}; - -/** - * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period - * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period - * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period - * @lock: Synchronize access to this structure - * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES - * @initialized: The @rcu_work fields have been initialized - * @count: Number of objects for which GP not started - * @bkvcache: - * A simple cache list that contains objects for reuse purpose. - * In order to save some per-cpu space the list is singular. - * Even though it is lockless an access has to be protected by the - * per-cpu lock. - * @page_cache_work: A work to refill the cache when it is empty - * @backoff_page_cache_fill: Delay cache refills - * @work_in_progress: Indicates that page_cache_work is running - * @hrtimer: A hrtimer for scheduling a page_cache_work - * @nr_bkv_objs: number of allocated objects at @bkvcache. + * During early boot, any blocking grace-period wait automatically + * implies a grace period. * - * This is a per-CPU structure. The reason that it is not included in - * the rcu_data structure is to permit this code to be extracted from - * the RCU files. Such extraction could allow further optimization of - * the interactions with the slab allocators. - */ -struct kfree_rcu_cpu { - struct rcu_head *head; - struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; - struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - raw_spinlock_t lock; - struct delayed_work monitor_work; - bool initialized; - int count; - - struct delayed_work page_cache_work; - atomic_t backoff_page_cache_fill; - atomic_t work_in_progress; - struct hrtimer hrtimer; - - struct llist_head bkvcache; - int nr_bkv_objs; -}; - -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), -}; - -static __always_inline void -debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) -{ -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - int i; - - for (i = 0; i < bhead->nr_records; i++) - debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); -#endif -} - -static inline struct kfree_rcu_cpu * -krc_this_cpu_lock(unsigned long *flags) -{ - struct kfree_rcu_cpu *krcp; - - local_irq_save(*flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - raw_spin_lock(&krcp->lock); - - return krcp; -} - -static inline void -krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) -{ - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -static inline struct kvfree_rcu_bulk_data * -get_cached_bnode(struct kfree_rcu_cpu *krcp) -{ - if (!krcp->nr_bkv_objs) - return NULL; - - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); - return (struct kvfree_rcu_bulk_data *) - llist_del_first(&krcp->bkvcache); -} - -static inline bool -put_cached_bnode(struct kfree_rcu_cpu *krcp, - struct kvfree_rcu_bulk_data *bnode) -{ - // Check the limit. - if (krcp->nr_bkv_objs >= rcu_min_cached_objs) - return false; - - llist_add((struct llist_node *) bnode, &krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); - return true; -} - -static int -drain_page_cache(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - struct llist_node *page_list, *pos, *n; - int freed = 0; - - raw_spin_lock_irqsave(&krcp->lock, flags); - page_list = llist_del_all(&krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, 0); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - llist_for_each_safe(pos, n, page_list) { - free_page((unsigned long)pos); - freed++; - } - - return freed; -} - -/* - * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bkvhead_free or ->head_free. - */ -static void kfree_rcu_work(struct work_struct *work) -{ - unsigned long flags; - struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; - struct rcu_head *head, *next; - struct kfree_rcu_cpu *krcp; - struct kfree_rcu_cpu_work *krwp; - int i, j; - - krwp = container_of(to_rcu_work(work), - struct kfree_rcu_cpu_work, rcu_work); - krcp = krwp->krcp; - - raw_spin_lock_irqsave(&krcp->lock, flags); - // Channels 1 and 2. - for (i = 0; i < FREE_N_CHANNELS; i++) { - bkvhead[i] = krwp->bkvhead_free[i]; - krwp->bkvhead_free[i] = NULL; - } - - // Channel 3. - head = krwp->head_free; - krwp->head_free = NULL; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - // Handle the first two channels. - for (i = 0; i < FREE_N_CHANNELS; i++) { - for (; bkvhead[i]; bkvhead[i] = bnext) { - bnext = bkvhead[i]->next; - debug_rcu_bhead_unqueue(bkvhead[i]); - - rcu_lock_acquire(&rcu_callback_map); - if (i == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bkvhead[i]->nr_records, - bkvhead[i]->records); - - kfree_bulk(bkvhead[i]->nr_records, - bkvhead[i]->records); - } else { // vmalloc() / vfree(). - for (j = 0; j < bkvhead[i]->nr_records; j++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, - bkvhead[i]->records[j], 0); - - vfree(bkvhead[i]->records[j]); - } - } - rcu_lock_release(&rcu_callback_map); - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (put_cached_bnode(krcp, bkvhead[i])) - bkvhead[i] = NULL; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (bkvhead[i]) - free_page((unsigned long) bkvhead[i]); - - cond_resched_tasks_rcu_qs(); - } - } - - /* - * This is used when the "bulk" path can not be used for the - * double-argument of kvfree_rcu(). This happens when the - * page-cache is empty, which means that objects are instead - * queued on a linked list through their rcu_head structures. - * This list is named "Channel 3". - */ - for (; head; head = next) { - unsigned long offset = (unsigned long)head->func; - void *ptr = (void *)head - offset; - - next = head->next; - debug_rcu_head_unqueue((struct rcu_head *)ptr); - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); - - rcu_lock_release(&rcu_callback_map); - cond_resched_tasks_rcu_qs(); - } -} - -static bool -need_offload_krc(struct kfree_rcu_cpu *krcp) -{ - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - if (krcp->bkvhead[i]) - return true; - - return !!krcp->head; -} - -static void -schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) -{ - long delay, delay_left; - - delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; - if (delayed_work_pending(&krcp->monitor_work)) { - delay_left = krcp->monitor_work.timer.expires - jiffies; - if (delay < delay_left) - mod_delayed_work(system_wq, &krcp->monitor_work, delay); - return; - } - queue_delayed_work(system_wq, &krcp->monitor_work, delay); -} - -/* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * Later on, this could in theory be the case for kernels built with + * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this + * is not a common case. Furthermore, this optimization would cause + * the rcu_gp_oldstate structure to expand by 50%, so this potential + * grace-period optimization is ignored once the scheduler is running. */ -static void kfree_rcu_monitor(struct work_struct *work) -{ - struct kfree_rcu_cpu *krcp = container_of(work, - struct kfree_rcu_cpu, monitor_work.work); - unsigned long flags; - int i, j; - - raw_spin_lock_irqsave(&krcp->lock, flags); - - // Attempt to start a new batch. - for (i = 0; i < KFREE_N_BATCHES; i++) { - struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - - // Try to detach bkvhead or head and attach it over any - // available corresponding free channel. It can be that - // a previous RCU batch is in progress, it means that - // immediately to queue another one is not possible so - // in that case the monitor work is rearmed. - if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || - (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || - (krcp->head && !krwp->head_free)) { - // Channel 1 corresponds to the SLAB-pointer bulk path. - // Channel 2 corresponds to vmalloc-pointer bulk path. - for (j = 0; j < FREE_N_CHANNELS; j++) { - if (!krwp->bkvhead_free[j]) { - krwp->bkvhead_free[j] = krcp->bkvhead[j]; - krcp->bkvhead[j] = NULL; - } - } - - // Channel 3 corresponds to both SLAB and vmalloc - // objects queued on the linked list. - if (!krwp->head_free) { - krwp->head_free = krcp->head; - krcp->head = NULL; - } - - WRITE_ONCE(krcp->count, 0); - - // One work is per one batch, so there are three - // "free channels", the batch can handle. It can - // be that the work is in the pending state when - // channels have been detached following by each - // other. - queue_rcu_work(system_wq, &krwp->rcu_work); - } - } - - // If there is nothing to detach, it means that our job is - // successfully done here. In case of having at least one - // of the channels that is still busy we should rearm the - // work to repeat an attempt. Because previous batches are - // still in progress. - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); - - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -static enum hrtimer_restart -schedule_page_work_fn(struct hrtimer *t) -{ - struct kfree_rcu_cpu *krcp = - container_of(t, struct kfree_rcu_cpu, hrtimer); - - queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); - return HRTIMER_NORESTART; -} - -static void fill_page_cache_func(struct work_struct *work) -{ - struct kvfree_rcu_bulk_data *bnode; - struct kfree_rcu_cpu *krcp = - container_of(work, struct kfree_rcu_cpu, - page_cache_work.work); - unsigned long flags; - int nr_pages; - bool pushed; - int i; - - nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? - 1 : rcu_min_cached_objs; - - for (i = 0; i < nr_pages; i++) { - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - - if (!bnode) - break; - - raw_spin_lock_irqsave(&krcp->lock, flags); - pushed = put_cached_bnode(krcp, bnode); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (!pushed) { - free_page((unsigned long) bnode); - break; - } - } - - atomic_set(&krcp->work_in_progress, 0); - atomic_set(&krcp->backoff_page_cache_fill, 0); -} - -static void -run_page_cache_worker(struct kfree_rcu_cpu *krcp) -{ - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !atomic_xchg(&krcp->work_in_progress, 1)) { - if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_wq, - &krcp->page_cache_work, - msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); - } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; - hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); - } - } -} - -// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() -// state specified by flags. If can_alloc is true, the caller must -// be schedulable and not be holding any locks or mutexes that might be -// acquired by the memory allocator or anything that it might invoke. -// Returns true if ptr was successfully recorded, else the caller must -// use a fallback. -static inline bool -add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, - unsigned long *flags, void *ptr, bool can_alloc) +static int rcu_blocking_is_gp(void) { - struct kvfree_rcu_bulk_data *bnode; - int idx; - - *krcp = krc_this_cpu_lock(flags); - if (unlikely(!(*krcp)->initialized)) + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) { + might_sleep(); return false; - - idx = !!is_vmalloc_addr(ptr); - - /* Check if a new block is required. */ - if (!(*krcp)->bkvhead[idx] || - (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { - bnode = get_cached_bnode(*krcp); - if (!bnode && can_alloc) { - krc_this_cpu_unlock(*krcp, *flags); - - // __GFP_NORETRY - allows a light-weight direct reclaim - // what is OK from minimizing of fallback hitting point of - // view. Apart of that it forbids any OOM invoking what is - // also beneficial since we are about to release memory soon. - // - // __GFP_NOMEMALLOC - prevents from consuming of all the - // memory reserves. Please note we have a fallback path. - // - // __GFP_NOWARN - it is supposed that an allocation can - // be failed under low memory or high memory pressure - // scenarios. - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - *krcp = krc_this_cpu_lock(flags); - } - - if (!bnode) - return false; - - /* Initialize the new block. */ - bnode->nr_records = 0; - bnode->next = (*krcp)->bkvhead[idx]; - - /* Attach it to the head. */ - (*krcp)->bkvhead[idx] = bnode; } - - /* Finally insert. */ - (*krcp)->bkvhead[idx]->records - [(*krcp)->bkvhead[idx]->nr_records++] = ptr; - return true; } /* - * Queue a request for lazy invocation of the appropriate free routine - * after a grace period. Please note that three paths are maintained, - * two for the common case using arrays of pointers and a third one that - * is used only when the main paths cannot be used, for example, due to - * memory pressure. - * - * Each kvfree_call_rcu() request is added to a batch. The batch will be drained - * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will - * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. + * Helper function for the synchronize_rcu() API. */ -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static void synchronize_rcu_normal(void) { - unsigned long flags; - struct kfree_rcu_cpu *krcp; - bool success; - void *ptr; + struct rcu_synchronize rs; - if (head) { - ptr = (void *) head - (unsigned long) func; - } else { - /* - * Please note there is a limitation for the head-less - * variant, that is why there is a clear rule for such - * objects: it can be used from might_sleep() context - * only. For other places please embed an rcu_head to - * your data. - */ - might_sleep(); - ptr = (unsigned long *) func; - } - - // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(ptr)) { - // Probable double kfree_rcu(), just leak. - WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", - __func__, head); + trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request")); - // Mark as success and leave. - return; + if (READ_ONCE(rcu_normal_wake_from_gp) < 1) { + wait_rcu_gp(call_rcu_hurry); + goto trace_complete_out; } - kasan_record_aux_stack_noalloc(ptr); - success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); - if (!success) { - run_page_cache_worker(krcp); - - if (head == NULL) - // Inline if kvfree_rcu(one_arg) call. - goto unlock_return; - - head->func = func; - head->next = krcp->head; - krcp->head = head; - success = true; - } - - WRITE_ONCE(krcp->count, krcp->count + 1); - - // Set timer to drain after KFREE_DRAIN_JIFFIES. - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - schedule_delayed_monitor_work(krcp); - -unlock_return: - krc_this_cpu_unlock(krcp, flags); + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); /* - * Inline kvfree() after synchronize_rcu(). We can do - * it from might_sleep() context only, so the current - * CPU can pass the QS state. + * This code might be preempted, therefore take a GP + * snapshot before adding a request. */ - if (!success) { - debug_rcu_head_unqueue((struct rcu_head *) ptr); - synchronize_rcu(); - kvfree(ptr); - } -} -EXPORT_SYMBOL_GPL(kvfree_call_rcu); - -static unsigned long -kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu; - unsigned long count = 0; - - /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + get_state_synchronize_rcu_full(&rs.oldstate); - count += READ_ONCE(krcp->count); - count += READ_ONCE(krcp->nr_bkv_objs); - atomic_set(&krcp->backoff_page_cache_fill, 1); - } - - return count == 0 ? SHRINK_EMPTY : count; -} - -static unsigned long -kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu, freed = 0; - - for_each_possible_cpu(cpu) { - int count; - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + rcu_sr_normal_add_req(&rs); - count = krcp->count; - count += drain_page_cache(krcp); - kfree_rcu_monitor(&krcp->monitor_work.work); + /* Kick a GP and start waiting. */ + (void) start_poll_synchronize_rcu(); - sc->nr_to_scan -= count; - freed += count; + /* Now we can wait. */ + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); - if (sc->nr_to_scan <= 0) - break; - } - - return freed == 0 ? SHRINK_STOP : freed; -} - -static struct shrinker kfree_rcu_shrinker = { - .count_objects = kfree_rcu_shrink_count, - .scan_objects = kfree_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; - -void __init kfree_rcu_scheduler_running(void) -{ - int cpu; - unsigned long flags; - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - } -} - -/* - * During early boot, any blocking grace-period wait automatically - * implies a grace period. - * - * Later on, this could in theory be the case for kernels built with - * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this - * is not a common case. Furthermore, this optimization would cause - * the rcu_gp_oldstate structure to expand by 50%, so this potential - * grace-period optimization is ignored once the scheduler is running. - */ -static int rcu_blocking_is_gp(void) -{ - if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) - return false; - might_sleep(); /* Check for RCU read-side critical section. */ - return true; +trace_complete_out: + trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete")); } /** @@ -3542,7 +3347,7 @@ void synchronize_rcu(void) if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else - wait_rcu_gp(call_rcu_hurry); + synchronize_rcu_normal(); return; } @@ -3619,14 +3424,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { - struct rcu_node *rnp = rcu_get_root(); - /* * Any prior manipulation of RCU-protected data must happen * before the loads from ->gp_seq and ->expedited_sequence. */ smp_mb(); /* ^^^ */ - rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); @@ -3642,7 +3450,6 @@ static void start_poll_synchronize_rcu_common(void) struct rcu_data *rdp; struct rcu_node *rnp; - lockdep_assert_irqs_enabled(); local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; @@ -3667,9 +3474,6 @@ static void start_poll_synchronize_rcu_common(void) * grace period has elapsed in the meantime. If the needed grace period * is not already slated to start, notifies RCU core of the need for that * grace period. - * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. */ unsigned long start_poll_synchronize_rcu(void) { @@ -3690,9 +3494,6 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * grace period (whether normal or expedited) has elapsed in the meantime. * If the needed grace period is not already slated to start, notifies * RCU core of the need for that grace period. - * - * Interrupts must be enabled for the case where it is necessary to awaken - * the grace-period kthread. */ void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { @@ -3711,7 +3512,9 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full); * If @false is returned, it is the caller's responsibility to invoke this * function later on until it does return @true. Alternatively, the caller * can explicitly wait for a grace period, for example, by passing @oldstate - * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited() + * on the one hand or by directly invoking either synchronize_rcu() or + * synchronize_rcu_expedited() on the other. * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for @@ -3722,6 +3525,12 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full); * completed. Alternatively, they can use get_completed_synchronize_rcu() * to get a guaranteed-completed grace-period state. * + * In addition, because oldstate compresses the grace-period state for + * both normal and expedited grace periods into a single unsigned long, + * it can miss a grace period when synchronize_rcu() runs concurrently + * with synchronize_rcu_expedited(). If this is unacceptable, please + * instead use the _full() variant of these polling APIs. + * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call * to the function that provided @oldstate, and that returned at the end @@ -3862,11 +3671,15 @@ static int rcu_pending(int user) return 1; /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ - if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu()) + gp_in_progress = rcu_gp_in_progress(); + if ((user || rcu_is_cpu_rrupt_from_idle() || + (gp_in_progress && + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + + nohz_full_patience_delay_jiffies))) && + rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? */ - gp_in_progress = rcu_gp_in_progress(); if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress) return 1; @@ -3914,6 +3727,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) { unsigned long __maybe_unused s = rcu_state.barrier_sequence; + rhp->next = rhp; // Mark the callback as having been invoked. if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { rcu_barrier_trace(TPS("LastCB"), -1, s); complete(&rcu_state.barrier_completion); @@ -3982,6 +3796,11 @@ static void rcu_barrier_handler(void *cpu_in) * to complete. For example, if there are no RCU callbacks queued anywhere * in the system, then rcu_barrier() is within its rights to return * immediately, without waiting for anything, much less an RCU grace period. + * In fact, rcu_barrier() will normally not result in any RCU grace periods + * beyond those that were already destined to be executed. + * + * In kernels built with CONFIG_RCU_LAZY=y, this function also hurries all + * pending lazy RCU callbacks. */ void rcu_barrier(void) { @@ -4079,6 +3898,202 @@ retry: } EXPORT_SYMBOL_GPL(rcu_barrier); +static unsigned long rcu_barrier_last_throttle; + +/** + * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second + * + * This can be thought of as guard rails around rcu_barrier() that + * permits unrestricted userspace use, at least assuming the hardware's + * try_cmpxchg() is robust. There will be at most one call per second to + * rcu_barrier() system-wide from use of this function, which means that + * callers might needlessly wait a second or three. + * + * This is intended for use by test suites to avoid OOM by flushing RCU + * callbacks from the previous test before starting the next. See the + * rcutree.do_rcu_barrier module parameter for more information. + * + * Why not simply make rcu_barrier() more scalable? That might be + * the eventual endpoint, but let's keep it simple for the time being. + * Note that the module parameter infrastructure serializes calls to a + * given .set() function, but should concurrent .set() invocation ever be + * possible, we are ready! + */ +static void rcu_barrier_throttled(void) +{ + unsigned long j = jiffies; + unsigned long old = READ_ONCE(rcu_barrier_last_throttle); + unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); + + while (time_in_range(j, old, old + HZ / 16) || + !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) { + schedule_timeout_idle(HZ / 16); + if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { + smp_mb(); /* caller's subsequent code after above check. */ + return; + } + j = jiffies; + old = READ_ONCE(rcu_barrier_last_throttle); + } + rcu_barrier(); +} + +/* + * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier + * request arrives. We insist on a true value to allow for possible + * future expansion. + */ +static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp) +{ + bool b; + int ret; + + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) + return -EAGAIN; + ret = kstrtobool(val, &b); + if (!ret && b) { + atomic_inc((atomic_t *)kp->arg); + rcu_barrier_throttled(); + atomic_dec((atomic_t *)kp->arg); + } + return ret; +} + +/* + * Output the number of outstanding rcutree.do_rcu_barrier requests. + */ +static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg)); +} + +static const struct kernel_param_ops do_rcu_barrier_ops = { + .set = param_set_do_rcu_barrier, + .get = param_get_do_rcu_barrier, +}; +static atomic_t do_rcu_barrier; +module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644); + +/* + * Compute the mask of online CPUs for the specified rcu_node structure. + * This will not be stable unless the rcu_node structure's ->lock is + * held, but the bit corresponding to the current CPU will be stable + * in most contexts. + */ +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +{ + return READ_ONCE(rnp->qsmaskinitnext); +} + +/* + * Is the CPU corresponding to the specified rcu_data structure online + * from RCU's perspective? This perspective is given by that structure's + * ->qsmaskinitnext field rather than by the global cpu_online_mask. + */ +static bool rcu_rdp_cpu_online(struct rcu_data *rdp) +{ + return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); +} + +bool rcu_cpu_online(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return rcu_rdp_cpu_online(rdp); +} + +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) + +/* + * Is the current CPU online as far as RCU is concerned? + * + * Disable preemption to avoid false positives that could otherwise + * happen due to the current CPU number being sampled, this task being + * preempted, its old CPU being taken offline, resuming on some other CPU, + * then determining that its old CPU is now offline. + * + * Disable checking if in an NMI handler because we cannot safely + * report errors from NMI handlers anyway. In addition, it is OK to use + * RCU on an offline processor during initial boot, hence the check for + * rcu_scheduler_fully_active. + */ +bool notrace rcu_lockdep_current_cpu_online(void) +{ + struct rcu_data *rdp; + bool ret = false; + + if (in_nmi() || !rcu_scheduler_fully_active) + return true; + preempt_disable_notrace(); + rdp = this_cpu_ptr(&rcu_data); + /* + * Strictly, we care here about the case where the current CPU is + * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask + * not being up to date. So arch_spin_is_locked() might have a + * false positive if it's held by some *other* CPU, but that's + * OK because that just means a false *negative* on the warning. + */ + if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) + ret = true; + preempt_enable_notrace(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ + +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!READ_ONCE(rcu_state.n_online_cpus); +} + +/* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated. + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely. That said, invoking it after the fact will cost you + * a needless lock acquisition. So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + raw_lockdep_assert_held_rcu_node(rnp_leaf); + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + WARN_ON_ONCE(rnp_leaf->qsmaskinit) || + WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) + return; + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (!rnp) + break; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + /* Between grace periods, so better already be zero! */ + WARN_ON_ONCE(rnp->qsmask); + if (rnp->qsmaskinit) { + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. */ + return; + } + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + } +} + /* * Propagate ->qsinitmask bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller @@ -4119,18 +4134,87 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); - WARN_ON_ONCE(ct->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu))); + WARN_ON_ONCE(ct->nesting != 1); + WARN_ON_ONCE(rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; - rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; - rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_onl_gp_state = RCU_GP_CLEANED; rdp->last_sched_clock = jiffies; rdp->cpu = cpu; rcu_boot_init_nocb_percpu_data(rdp); } +static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) +{ + cpumask_var_t affinity; + int cpu; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return; + + for_each_leaf_node_possible_cpu(rnp, cpu) + cpumask_set_cpu(cpu, affinity); + + kthread_affine_preferred(t, affinity); + + free_cpumask_var(affinity); +} + +struct kthread_worker *rcu_exp_gp_kworker; + +static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) +{ + struct kthread_worker *kworker; + const char *name = "rcu_exp_par_gp_kthread_worker/%d"; + struct sched_param param = { .sched_priority = kthread_prio }; + int rnp_index = rnp - rcu_get_root(); + + if (rnp->exp_kworker) + return; + + kworker = kthread_create_worker(0, name, rnp_index); + if (IS_ERR_OR_NULL(kworker)) { + pr_err("Failed to create par gp kworker on %d/%d\n", + rnp->grplo, rnp->grphi); + return; + } + WRITE_ONCE(rnp->exp_kworker, kworker); + + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); + + rcu_thread_affine_rnp(kworker->task, rnp); + wake_up_process(kworker->task); +} + +static void __init rcu_start_exp_gp_kworker(void) +{ + const char *name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_run_worker(0, name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", name); + rcu_exp_gp_kworker = NULL; + return; + } + + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); +} + +static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp) +{ + if (rcu_scheduler_fully_active) { + mutex_lock(&rnp->kthread_mutex); + rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_exp_par_gp_kworker(rnp); + mutex_unlock(&rnp->kthread_mutex); + } +} + /* * Invoked early in the CPU-online process, when pretty much all services * are available. The incoming CPU is not present. @@ -4153,7 +4237,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->blimit = blimit; - ct->dynticks_nesting = 1; /* CPU not up, no tearing. */ + ct->nesting = 1; /* CPU not up, no tearing. */ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* @@ -4170,7 +4254,6 @@ int rcutree_prepare_cpu(unsigned int cpu) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rdp->beenonline = true; /* We have now been online. */ rdp->gp_seq = READ_ONCE(rnp->gp_seq); rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; @@ -4180,21 +4263,24 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcu_spawn_one_boost_kthread(rnp); + + rcu_preempt_deferred_qs_init(rdp); + rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); + ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); return 0; } /* - * Update RCU priority boot kthread affinity for CPU-hotplug changes. + * Has the specified (known valid) CPU ever been fully online? */ -static void rcutree_affinity_setting(unsigned int cpu, int outgoing) +bool rcu_cpu_beenfullyonline(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); + return smp_load_acquire(&rdp->beenonline); } /* @@ -4214,8 +4300,6 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ - sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); @@ -4223,29 +4307,6 @@ int rcutree_online_cpu(unsigned int cpu) } /* - * Near the beginning of the process. The CPU is still very much alive - * with pretty much all services enabled. - */ -int rcutree_offline_cpu(unsigned int cpu) -{ - unsigned long flags; - struct rcu_data *rdp; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask &= ~rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - rcutree_affinity_setting(cpu, cpu); - - // nohz_full CPUs need the tick for stop-machine to work quickly - tick_dep_set(TICK_DEP_BIT_RCU); - return 0; -} - -/* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that * incoming CPUs are not allowed to use RCU read-side critical sections @@ -4255,15 +4316,18 @@ int rcutree_offline_cpu(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * This incoming CPU must not have enabled interrupts yet. + * + * This mirrors the effects of rcutree_report_cpu_dead(). */ -void rcu_cpu_starting(unsigned int cpu) +void rcutree_report_cpu_starting(unsigned int cpu) { - unsigned long flags; unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp; bool newcpu; + lockdep_assert_irqs_disabled(); rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu_started) return; @@ -4271,9 +4335,8 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; - local_irq_save(flags); arch_spin_lock(&rcu_state.ofl_lock); - rcu_dynticks_eqs_online(); + rcu_watching_online(); raw_spin_lock(&rcu_state.barrier_lock); raw_spin_lock_rcu_node(rnp); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); @@ -4285,22 +4348,22 @@ void rcu_cpu_starting(unsigned int cpu) ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); - rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); + rdp->rcu_onl_gp_state = READ_ONCE(rcu_state.gp_state); /* An incoming CPU should never be blocking a grace period. */ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ /* rcu_report_qs_rnp() *really* wants some flags to restore */ - unsigned long flags2; + unsigned long flags; - local_irq_save(flags2); + local_irq_save(flags); rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ - rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2); + rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { raw_spin_unlock_rcu_node(rnp); } arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(flags); + smp_store_release(&rdp->beenonline, true); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } @@ -4311,14 +4374,27 @@ void rcu_cpu_starting(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the outgoing CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * + * This mirrors the effect of rcutree_report_cpu_starting(). */ -void rcu_report_dead(unsigned int cpu) +void rcutree_report_cpu_dead(void) { - unsigned long flags, seq_flags; + unsigned long flags; unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + /* + * IRQS must be disabled from now on and until the CPU dies, or an interrupt + * may introduce a new READ-side while it is actually off the QS masks. + */ + lockdep_assert_irqs_disabled(); + /* + * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution. + * The requested QS must have been reported on the last context switch + * from stop machine to idle. + */ + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); // Do any dangling deferred wakeups. do_nocb_deferred_wakeup(rdp); @@ -4326,22 +4402,27 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - local_irq_save(seq_flags); + + /* + * Hold the ofl_lock and rnp lock to avoid races between CPU going + * offline and doing a QS report (as below), versus rcu_gp_init(). + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section + * for more details. + */ arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); - rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); + rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } + /* Clear from ->qsmaskinitnext to mark offline. */ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(seq_flags); - rdp->cpu_started = false; } @@ -4359,11 +4440,15 @@ void rcutree_migrate_callbacks(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); bool needwake; - if (rcu_rdp_is_offloaded(rdp) || - rcu_segcblist_empty(&rdp->cblist)) - return; /* No callbacks to migrate. */ + if (rcu_rdp_is_offloaded(rdp)) + return; raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + if (rcu_segcblist_empty(&rdp->cblist)) { + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + return; /* No callbacks to migrate. */ + } + WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); @@ -4385,8 +4470,9 @@ void rcutree_migrate_callbacks(int cpu) __call_rcu_nocb_wake(my_rdp, true, flags); } else { rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); + raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ } + local_irq_restore(flags); if (needwake) rcu_gp_kthread_wake(); lockdep_assert_irqs_enabled(); @@ -4396,7 +4482,59 @@ void rcutree_migrate_callbacks(int cpu) cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); } -#endif + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. + */ +int rcutree_dead_cpu(unsigned int cpu) +{ + ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); + return 0; +} + +/* + * Near the end of the offline process. Trace the fact that this CPU + * is going offline. + */ +int rcutree_dying_cpu(unsigned int cpu) +{ + bool blkd; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp = rdp->mynode; + + blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); + return 0; +} + +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. + */ +int rcutree_offline_cpu(unsigned int cpu) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + // nohz_full CPUs need the tick for stop-machine to work quickly + tick_dep_set(TICK_DEP_BIT_RCU); + return 0; +} +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* * On non-huge systems, use expedited RCU grace periods to make suspend @@ -4408,11 +4546,13 @@ static int rcu_pm_notify(struct notifier_block *self, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: + rcu_async_hurry(); rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: rcu_unexpedite_gp(); + rcu_async_relax(); break; default: break; @@ -4420,51 +4560,6 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } -#ifdef CONFIG_RCU_EXP_KTHREAD -struct kthread_worker *rcu_exp_gp_kworker; -struct kthread_worker *rcu_exp_par_gp_kworker; - -static void __init rcu_start_exp_gp_kworkers(void) -{ - const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; - const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; - struct sched_param param = { .sched_priority = kthread_prio }; - - rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { - pr_err("Failed to create %s!\n", gp_kworker_name); - return; - } - - rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { - pr_err("Failed to create %s!\n", par_gp_kworker_name); - kthread_destroy_worker(rcu_exp_gp_kworker); - return; - } - - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); - sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, - ¶m); -} - -static inline void rcu_alloc_par_gp_wq(void) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -struct workqueue_struct *rcu_par_gp_wq; - -static void __init rcu_start_exp_gp_kworkers(void) -{ -} - -static inline void rcu_alloc_par_gp_wq(void) -{ - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); -} -#endif /* CONFIG_RCU_EXP_KTHREAD */ - /* * Spawn the kthreads that handle RCU's grace periods. */ @@ -4499,10 +4594,10 @@ static int __init rcu_spawn_gp_kthread(void) * due to rcu_scheduler_fully_active. */ rcu_spawn_cpu_nocb_kthread(smp_processor_id()); - rcu_spawn_one_boost_kthread(rdp->mynode); + rcu_spawn_rnp_kthreads(rdp->mynode); rcu_spawn_core_kthreads(); /* Create kthread worker for expedited GPs */ - rcu_start_exp_gp_kworkers(); + rcu_start_exp_gp_kworker(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -4605,7 +4700,7 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[2]); init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); - mutex_init(&rnp->boost_kthread_mutex); + mutex_init(&rnp->kthread_mutex); raw_spin_lock_init(&rnp->exp_poll_lock); rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); @@ -4619,6 +4714,8 @@ static void __init rcu_init_one(void) while (i > rnp->grphi) rnp++; per_cpu_ptr(&rcu_data, i)->mynode = rnp; + per_cpu_ptr(&rcu_data, i)->barrier_head.next = + &per_cpu_ptr(&rcu_data, i)->barrier_head; rcu_boot_init_percpu_data(i); } } @@ -4697,8 +4794,7 @@ void rcu_init_geometry(void) * Complain and fall back to the compile-time values if this * limit is exceeded. */ - if (rcu_fanout_leaf < 2 || - rcu_fanout_leaf > sizeof(unsigned long) * 8) { + if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > BITS_PER_LONG) { rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; @@ -4763,46 +4859,12 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; -static void __init kfree_rcu_batch_init(void) -{ - int cpu; - int i; - - /* Clamp it to [0:100] seconds interval. */ - if (rcu_delay_page_cache_fill_msec < 0 || - rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { - - rcu_delay_page_cache_fill_msec = - clamp(rcu_delay_page_cache_fill_msec, 0, - (int) (100 * MSEC_PER_SEC)); - - pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", - rcu_delay_page_cache_fill_msec); - } - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - for (i = 0; i < KFREE_N_BATCHES; i++) { - INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); - krcp->krw_arr[i].krcp = krcp; - } - - INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); - INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); - krcp->initialized = true; - } - if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) - pr_err("Failed to register kfree_rcu() shrinker!\n"); -} - void __init rcu_init(void) { int cpu = smp_processor_id(); rcu_early_boot_tests(); - kfree_rcu_batch_init(); rcu_bootup_announce(); sanitize_kthread_prio(); rcu_init_geometry(); @@ -4820,13 +4882,21 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ - rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM | WQ_PERCPU, 0); WARN_ON(!rcu_gp_wq); - rcu_alloc_par_gp_wq(); + + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + WARN_ON(!sync_wq); + + /* Respect if explicitly disabled via a boot parameter. */ + if (rcu_normal_wake_from_gp < 0) { + if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD) + rcu_normal_wake_from_gp = 1; + } /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ @@ -4835,9 +4905,12 @@ void __init rcu_init(void) else qovld_calc = qovld; - // Kick-start any polled grace periods that started early. - if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) - (void)start_poll_synchronize_rcu_expedited(); + // Kick-start in case any polled grace periods started early. + (void)start_poll_synchronize_rcu_expedited(); + + rcu_test_sync_prims(); + + tasks_cblist_init_generic(); } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fcb5d696eb17..b8bbe7960cda 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -21,14 +21,10 @@ #include "rcu_segcblist.h" -/* Communicate arguments to a workqueue handler. */ +/* Communicate arguments to a kthread worker handler. */ struct rcu_exp_work { unsigned long rew_s; -#ifdef CONFIG_RCU_EXP_KTHREAD struct kthread_work rew_work; -#else - struct work_struct rew_work; -#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ @@ -72,6 +68,9 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + struct kthread_worker *exp_kworker; + /* Workers performing per node expedited GP */ + /* initialization. */ unsigned long cbovldmask; /* CPUs experiencing callback overload. */ unsigned long ffmask; /* Fully functional CPUs. */ @@ -113,7 +112,7 @@ struct rcu_node { /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ - struct mutex boost_kthread_mutex; + struct mutex kthread_mutex; /* Exclusion for thread spawning and affinity */ /* manipulation. */ struct task_struct *boost_kthread_task; @@ -158,6 +157,34 @@ union rcu_noqs { u16 s; /* Set of bits, aggregate OR here. */ }; +/* + * Record the snapshot of the core stats at half of the first RCU stall timeout. + * The member gp_seq is used to ensure that all members are updated only once + * during the sampling period. The snapshot is taken only if this gp_seq is not + * equal to rdp->gp_seq. + */ +struct rcu_snap_record { + unsigned long gp_seq; /* Track rdp->gp_seq counter */ + u64 cputime_irq; /* Accumulated cputime of hard irqs */ + u64 cputime_softirq;/* Accumulated cputime of soft irqs */ + u64 cputime_system; /* Accumulated cputime of kernel tasks */ + u64 nr_hardirqs; /* Accumulated number of hard irqs */ + unsigned int nr_softirqs; /* Accumulated number of soft irqs */ + unsigned long long nr_csw; /* Accumulated number of task switches */ + unsigned long jiffies; /* Track jiffies value */ +}; + +/* + * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention. + * to report quiescent states at the soonest possible time. + * The request can be in one of the following states: + * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled. + * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it + * ran and we still haven't reported a quiescent state. + */ +#define DEFER_QS_IDLE 0 +#define DEFER_QS_PENDING 1 + /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -167,6 +194,7 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ + unsigned int gpwrap_count; /* Count of GP sequence wrap. */ bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ @@ -175,7 +203,7 @@ struct rcu_data { /* during and after the last grace */ /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ - bool defer_qs_iw_pending; /* Scheduler attention pending? */ + int defer_qs_iw_pending; /* Scheduler attention pending? */ struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ @@ -190,7 +218,7 @@ struct rcu_data { long blimit; /* Upper limit on a processed batch */ /* 3) dynticks interface. */ - int dynticks_snap; /* Per-GP tracking for dynticks. */ + int watching_snap; /* Per-GP tracking for dynticks. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ @@ -199,7 +227,7 @@ struct rcu_data { /* 4) rcu_barrier(), OOM callbacks, and expediting. */ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */ struct rcu_head barrier_head; - int exp_dynticks_snap; /* Double-check need for IPI. */ + int exp_watching_snap; /* Double-check need for IPI. */ /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -207,7 +235,6 @@ struct rcu_data { struct swait_queue_head nocb_state_wq; /* For offloading state changes */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ - atomic_t nocb_lock_contended; /* Contention experienced. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ @@ -257,11 +284,13 @@ struct rcu_data { bool rcu_iw_pending; /* Is ->rcu_iw pending? */ unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */ - short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */ + short rcu_ofl_gp_state; /* ->gp_state at last offline. */ unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ - short rcu_onl_gp_flags; /* ->gp_flags at last online. */ + short rcu_onl_gp_state; /* ->gp_state at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ + struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */ + /* the first RCU stall timeout */ long lazy_len; /* Length of buffered lazy callbacks. */ int cpu; @@ -298,6 +327,19 @@ do { \ } while (0) /* + * A max threshold for synchronize_rcu() users which are + * awaken directly by the rcu_gp_kthread(). Left part is + * deferred to the main worker. + */ +#define SR_MAX_USERS_WAKE_FROM_GP 5 +#define SR_NORMAL_GP_WAIT_HEAD_MAX 5 + +struct sr_wait_node { + atomic_t inuse; + struct llist_node node; +}; + +/* * RCU global state, including node hierarchy. This hierarchy is * represented in "heap" form in a dense array. The root (first level) * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second @@ -367,6 +409,10 @@ struct rcu_state { /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + int nr_fqs_jiffies_stall; /* Number of fqs loops after + * which read jiffies and set + * jiffies_stall. Stall + * warnings disabled if !0. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ @@ -377,7 +423,19 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ + + /* synchronize_rcu() part. */ + struct llist_head srs_next; /* request a GP users. */ + struct llist_node *srs_wait_tail; /* wait for GP users. */ + struct llist_node *srs_done_tail; /* ready for GP users. */ + struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; + struct work_struct srs_cleanup_work; + atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ + +#ifdef CONFIG_RCU_NOCB_CPU + struct mutex nocb_mutex; /* Guards (de-)offloading */ int nocb_is_setup; /* nocb is setup from boot */ +#endif }; /* Values for rcu_state structure's gp_flags field. */ @@ -430,6 +488,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); @@ -444,11 +503,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp); static bool wake_nocb_gp(struct rcu_data *rdp, bool force); static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j, bool lazy); -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags, - bool lazy); -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, - unsigned long flags); +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy); +static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index ed6c3cce28f2..96c49c56fc14 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -7,10 +7,12 @@ * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ +#include <linux/console.h> #include <linux/lockdep.h> static void rcu_exp_handler(void *unused); static int rcu_print_task_exp_stall(struct rcu_node *rnp); +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp); /* * Record the start of an expedited grace period. @@ -139,6 +141,13 @@ static void __maybe_unused sync_exp_reset_tree(void) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); WRITE_ONCE(rnp->expmask, rnp->expmaskinit); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp)) + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -172,7 +181,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) return ret; } - /* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU @@ -198,10 +206,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ + if (wake) swake_up_one(&rcu_state.expedited_wq); - } + break; } mask = rnp->grpmask; @@ -227,20 +234,22 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. + * specified leaf rcu_node structure, which is acquired by the caller. */ -static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, - unsigned long mask, bool wake) +static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, + unsigned long mask_in, bool wake) + __releases(rnp->lock) { int cpu; - unsigned long flags; + unsigned long mask; struct rcu_data *rdp; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { + raw_lockdep_assert_held_rcu_node(rnp); + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + mask = mask_in & rnp->expmask; WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); @@ -257,8 +266,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { + unsigned long flags; + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); - rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); + ASSERT_EXCLUSIVE_WRITER(rdp->cpu_no_qs.b.exp); + rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true); } /* Common code for work-done checking. */ @@ -266,7 +280,12 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - smp_mb(); /* Ensure test happens before caller kfree(). */ + /* + * Order GP completion with preceding accesses. Order also GP + * completion with post GP update side accesses. Pairs with + * rcu_seq_end(). + */ + smp_mb(); return true; } return false; @@ -358,22 +377,29 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(cpu); - if (rcu_dynticks_in_eqs(snap)) + /* + * Full ordering between remote CPU's post idle accesses + * and updater's accesses prior to current GP (and also + * the started GP sequence number) is enforced by + * rcu_seq_start() implicit barrier, relayed by kworkers + * locking and even further by smp_mb__after_unlock_lock() + * barriers chained all the way throughout the rnp locking + * tree since sync_exp_reset_tree() and up to the current + * leaf rnp locking. + * + * Ordering between remote CPU's pre idle accesses and + * post grace period updater's accesses is enforced by the + * below acquire semantic. + */ + snap = ct_rcu_watching_cpu_acquire(cpu); + if (rcu_watching_snap_in_eqs(snap)) mask_ofl_test |= mask; else - rdp->exp_dynticks_snap = snap; + rdp->exp_watching_snap = snap; } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited GP - * until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -382,7 +408,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) unsigned long mask = rdp->grpmask; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->exp_watching_snap)) { mask_ofl_test |= mask; continue; } @@ -413,13 +439,14 @@ retry_ipi: raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); + if (mask_ofl_test) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_report_exp_cpu_mult(rnp, flags, mask_ofl_test, false); + } } static void rcu_exp_sel_wait_wake(unsigned long s); -#ifdef CONFIG_RCU_EXP_KTHREAD static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) { struct rcu_exp_work *rewp = @@ -428,9 +455,14 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) __sync_rcu_exp_select_node_cpus(rewp); } -static inline bool rcu_gp_par_worker_started(void) +static inline bool rcu_exp_worker_started(void) { - return !!READ_ONCE(rcu_exp_par_gp_kworker); + return !!READ_ONCE(rcu_exp_gp_kworker); +} + +static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp) +{ + return !!READ_ONCE(rnp->exp_kworker); } static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) @@ -441,7 +473,7 @@ static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) * another work item on the same kthread worker can result in * deadlock. */ - kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); + kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work); } static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) @@ -466,64 +498,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); } -static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) -{ - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); - - __sync_rcu_exp_select_node_cpus(rewp); -} - -static inline bool rcu_gp_par_worker_started(void) -{ - return !!READ_ONCE(rcu_par_gp_wq); -} - -static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) -{ - int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); -} - -static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) -{ - flush_work(&rnp->rew.rew_work); -} - -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - -static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) -{ - INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew->rew_work); -} - -static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) -{ - destroy_work_on_stack(&rew->rew_work); -} -#endif /* CONFIG_RCU_EXP_KTHREAD */ - /* * Select the nodes that the upcoming expedited grace period needs * to wait for. @@ -541,7 +515,7 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!rcu_gp_par_worker_started() || + if (!rcu_exp_par_worker_started(rnp) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { /* No worker started yet or last leaf, do direct call. */ @@ -579,6 +553,67 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit) } /* + * Print out an expedited RCU CPU stall warning message. + */ +static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j) +{ + int cpu; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(); + + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); + ndetected = 0; + rcu_for_each_leaf_node(rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp; + + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(&rcu_data, cpu); + pr_cont(" %d-%c%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); + } + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); + if (ndetected) { + pr_err("blocking rcu_node structures (internal RCU debug):"); + rcu_for_each_node_breadth_first(rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_exp_done_unlocked(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + dump_cpu_task(cpu); + } + rcu_exp_print_detail_task_stall_rnp(rnp); + } +} + +/* * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. */ @@ -589,10 +624,9 @@ static void synchronize_rcu_expedited_wait(void) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; - int ndetected; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(); + unsigned long flags; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); jiffies_stall = rcu_exp_jiffies_till_stall_check(); @@ -601,17 +635,17 @@ static void synchronize_rcu_expedited_wait(void) if (synchronize_rcu_expedited_wait_once(1)) return; rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); mask = READ_ONCE(rnp->expmask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->rcu_forced_tick_exp) continue; rdp->rcu_forced_tick_exp = true; - preempt_disable(); if (cpu_online(cpu)) tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); - preempt_enable(); } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } j = READ_ONCE(jiffies_till_first_fqs); if (synchronize_rcu_expedited_wait_once(j + HZ)) @@ -619,58 +653,23 @@ static void synchronize_rcu_expedited_wait(void) } for (;;) { + unsigned long j; + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; if (rcu_stall_is_suppressed()) continue; - trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rcu_state.name); - ndetected = 0; - rcu_for_each_leaf_node(rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) { - struct rcu_data *rdp; - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], - "D."[!!(rdp->cpu_no_qs.b.exp)]); - } - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rcu_state.expedited_sequence, - data_race(rnp_root->expmask), - ".T"[!!data_race(rnp_root->exp_tasks)]); - if (ndetected) { - pr_err("blocking rcu_node structures (internal RCU debug):"); - rcu_for_each_node_breadth_first(rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_exp_done_unlocked(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - data_race(rnp->expmask), - ".T"[!!data_race(rnp->exp_tasks)]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rnp) { - for_each_leaf_node_possible_cpu(rnp, cpu) { - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - dump_cpu_task(cpu); - } - } + nbcon_cpu_emergency_enter(); + + j = jiffies; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); + trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); + synchronize_rcu_expedited_stall(jiffies_start, j); jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; + + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); } } @@ -722,6 +721,17 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) +{ + lockdep_assert_irqs_disabled(); + ASSERT_EXCLUSIVE_WRITER_SCOPED(*this_cpu_ptr(&rcu_data.cpu_no_qs.b.exp)); + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_need_resched_current(); +} + #ifdef CONFIG_PREEMPT_RCU /* @@ -740,24 +750,30 @@ static void rcu_exp_handler(void *unused) struct task_struct *t = current; /* - * First, the common case of not being in an RCU read-side + * WARN if the CPU is unexpectedly already looking for a + * QS or has already reported one. + */ + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); + if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + READ_ONCE(rdp->cpu_no_qs.b.exp))) + return; + + /* + * Second, the common case of not being in an RCU read-side * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - rcu_is_cpu_rrupt_from_idle()) { + rcu_is_cpu_rrupt_from_idle()) rcu_report_exp_rdp(rdp); - } else { - WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + else + rcu_exp_need_qs(); return; } /* - * Second, the less-common case of being in an RCU read-side + * Third, the less-common case of being in an RCU read-side * critical section. In this case we can count on a future * rcu_read_unlock(). However, this rcu_read_unlock() might * execute on some other CPU, but in that case there will be @@ -778,15 +794,10 @@ static void rcu_exp_handler(void *unused) return; } - // Finally, negative nesting depth should not happen. + // Fourth and finally, negative nesting depth should not happen. WARN_ON_ONCE(1); } -/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ -} - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each that is blocking the current @@ -798,9 +809,11 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) int ndetected = 0; struct task_struct *t; - if (!READ_ONCE(rnp->exp_tasks)) - return 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rnp->exp_tasks) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return 0; + } t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { @@ -811,18 +824,38 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPT_RCU */ - -/* Request an expedited quiescent state. */ -static void rcu_exp_need_qs(void) +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, dumping the stack of each that is blocking the current + * expedited grace period. + */ +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) { - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + unsigned long flags; + struct task_struct *t; + + if (!rcu_exp_stall_task_details) + return; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!READ_ONCE(rnp->exp_tasks)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); + sched_show_task(t); + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +#else /* #ifdef CONFIG_PREEMPT_RCU */ + /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { @@ -830,6 +863,7 @@ static void rcu_exp_handler(void *unused) struct rcu_node *rnp = rdp->mynode; bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; @@ -841,38 +875,6 @@ static void rcu_exp_handler(void *unused) rcu_exp_need_qs(); } -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - unsigned long flags; - int my_cpu; - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - my_cpu = get_cpu(); - /* Quiescent state either not needed or already requested, leave. */ - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - READ_ONCE(rdp->cpu_no_qs.b.exp)) { - put_cpu(); - return; - } - /* Quiescent state needed on current CPU, so set it up locally. */ - if (my_cpu == cpu) { - local_irq_save(flags); - rcu_exp_need_qs(); - local_irq_restore(flags); - put_cpu(); - return; - } - /* Quiescent state needed on some other CPU, send IPI. */ - ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); - put_cpu(); - WARN_ON_ONCE(ret); -} - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections that are @@ -883,6 +885,15 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) return 0; } +/* + * Because preemptible RCU does not exist, we never have to print out + * tasks blocked within RCU read-side critical sections that are blocking + * the current expedited grace period. + */ +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +} + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /** @@ -907,7 +918,6 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); unsigned long flags; struct rcu_exp_work rew; struct rcu_node *rnp; @@ -937,7 +947,7 @@ void synchronize_rcu_expedited(void) /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_hurry); + synchronize_rcu_normal(); return; } @@ -947,7 +957,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(boottime)) { + if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -960,13 +970,9 @@ void synchronize_rcu_expedited(void) rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); - smp_mb(); /* Work actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); - - if (likely(!boottime)) - synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); @@ -1022,9 +1028,10 @@ unsigned long start_poll_synchronize_rcu_expedited(void) if (rcu_init_invoked()) raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); if (!poll_state_synchronize_rcu(s)) { - rnp->exp_seq_poll_rq = s; - if (rcu_init_invoked()) + if (rcu_init_invoked()) { + rnp->exp_seq_poll_rq = s; queue_work(rcu_gp_wq, &rnp->exp_poll_wq); + } } if (rcu_init_invoked()) raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 9e1c8caec5ce..e6cd56603cad 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -16,10 +16,6 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return lockdep_is_held(&rdp->nocb_lock); -} static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) { @@ -77,9 +73,9 @@ __setup("rcu_nocbs", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { rcu_nocb_poll = true; - return 0; + return 1; } -early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +__setup("rcu_nocb_poll", parse_rcu_nocb_poll); /* * Don't bother bypassing ->cblist if the call_rcu() rate is low. @@ -91,8 +87,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0); /* * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the - * lock isn't immediately available, increment ->nocb_lock_contended to - * flag the contention. + * lock isn't immediately available, perform minimal sanity check. */ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) __acquires(&rdp->nocb_bypass_lock) @@ -100,29 +95,12 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp) lockdep_assert_irqs_disabled(); if (raw_spin_trylock(&rdp->nocb_bypass_lock)) return; - atomic_inc(&rdp->nocb_lock_contended); + /* + * Contention expected only when local enqueue collide with + * remote flush from kthreads. + */ WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - smp_mb__after_atomic(); /* atomic_inc() before lock. */ raw_spin_lock(&rdp->nocb_bypass_lock); - smp_mb__before_atomic(); /* atomic_dec() after lock. */ - atomic_dec(&rdp->nocb_lock_contended); -} - -/* - * Spinwait until the specified rcu_data structure's ->nocb_lock is - * not contended. Please note that this is extremely special-purpose, - * relying on the fact that at most two kthreads and one CPU contend for - * this lock, and also that the two kthreads are guaranteed to have frequent - * grace-period-duration time intervals between successive acquisitions - * of the lock. This allows us to use an extremely simple throttling - * mechanism, and further to apply it only to the CPU doing floods of - * call_rcu() invocations. Don't try this at home! - */ -static void rcu_nocb_wait_contended(struct rcu_data *rdp) -{ - WARN_ON_ONCE(smp_processor_id() != rdp->cpu); - while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) - cpu_relax(); } /* @@ -228,7 +206,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&rdp_gp->nocb_timer); + timer_delete(&rdp_gp->nocb_timer); } if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { @@ -238,7 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - wake_up_process(rdp_gp->nocb_gp_kthread); + swake_up_one(&rdp_gp->nocb_gp_wq); } return needwake; @@ -256,6 +234,7 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) return __wake_nocb_gp(rdp_gp, rdp, force, flags); } +#ifdef CONFIG_RCU_LAZY /* * LAZY_FLUSH_JIFFIES decides the maximum amount of time that * can elapse before lazy callbacks are flushed. Lazy callbacks @@ -264,21 +243,20 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) * left unsubmitted to RCU after those many jiffies. */ #define LAZY_FLUSH_JIFFIES (10 * HZ) -static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES; +static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES; -#ifdef CONFIG_RCU_LAZY // To be called only from test code. -void rcu_lazy_set_jiffies_till_flush(unsigned long jif) +void rcu_set_jiffies_lazy_flush(unsigned long jif) { - jiffies_till_flush = jif; + jiffies_lazy_flush = jif; } -EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush); +EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush); -unsigned long rcu_lazy_get_jiffies_till_flush(void) +unsigned long rcu_get_jiffies_lazy_flush(void) { - return jiffies_till_flush; + return jiffies_lazy_flush; } -EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush); +EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush); #endif /* @@ -298,8 +276,8 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, * callback storms, no need to wake up too early. */ if (waketype == RCU_NOCB_WAKE_LAZY && - rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { - mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush); + rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { + mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush()); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); } else if (waketype == RCU_NOCB_WAKE_BYPASS) { mod_timer(&rdp_gp->nocb_timer, jiffies + 2); @@ -431,14 +409,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, return false; } - // In the process of (de-)offloading: no bypassing, but - // locking. - if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; /* Not offloaded, no bypassing. */ - } - // Don't use ->nocb_bypass during early boot. if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { rcu_nocb_lock(rdp); @@ -482,7 +452,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // flush ->nocb_bypass to ->cblist. if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) || (ncbs && bypass_is_lazy && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) || + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) || ncbs >= qhimark) { rcu_nocb_lock(rdp); *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); @@ -510,7 +480,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } // We need to use the bypass. - rcu_nocb_wait_contended(rdp); rcu_nocb_bypass_lock(rdp); ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ @@ -524,7 +493,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); } rcu_nocb_bypass_unlock(rdp); - smp_mb(); /* Order enqueue before wake. */ + // A wake up of the grace period kthread or timer adjustment // needs to be done only if: // 1. Bypass list was fully empty before (this is the first @@ -532,9 +501,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // 2. Both of these conditions are met: // a. The bypass list previously had only lazy CBs, and: // b. The new CB is non-lazy. - if (ncbs && (!bypass_is_lazy || lazy)) { - local_irq_restore(flags); - } else { + if (!ncbs || (bypass_is_lazy && !lazy)) { // No-CBs GP kthread might be indefinitely asleep, if so, wake. rcu_nocb_lock(rdp); // Rare during call_rcu() flood. if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { @@ -544,7 +511,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQnoWake")); - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); } } return true; // Callback already enqueued. @@ -566,11 +533,12 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, long lazy_len; long len; struct task_struct *t; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); if (rcu_nocb_poll || !t) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); return; @@ -583,17 +551,17 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rdp->qlen_last_fqs_check = len; // Only lazy CBs in bypass list if (lazy_len && bypass_len == lazy_len) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); } else if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... */ - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); } @@ -610,56 +578,59 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, smp_mb(); /* Enqueue before timer_pending(). */ if ((rdp->nocb_cb_sleep || !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp->nocb_timer)) { - rcu_nocb_unlock_irqrestore(rdp, flags); + !timer_pending(&rdp_gp->nocb_timer)) { + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } } -static int nocb_gp_toggle_rdp(struct rcu_data *rdp, - bool *wake_state) +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) +{ + bool was_alldone; + + if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) { + /* Not enqueued on bypass but locked, do regular enqueue */ + rcutree_enqueue(rdp, head, func); + __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ + } +} + +static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; - rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * Locking orders future de-offloaded callbacks enqueue against previous + * handling of this rdp. Ie: Make sure rcuog is done with this rdp before + * deoffloaded callbacks can be enqueued. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { /* * Offloading. Set our flag and notify the offload worker. * We will handle this rdp until it ever gets de-offloaded. */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; - ret = 1; - } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); + rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED); + } else { /* * De-offloading. Clear our flag and notify the de-offload worker. * We will ignore this rdp until it ever gets re-offloaded. */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *wake_state = true; - ret = 0; - } else { - WARN_ON_ONCE(1); - ret = -1; + list_del(&rdp->nocb_entry_rdp); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED); } - - rcu_nocb_unlock_irqrestore(rdp, flags); - - return ret; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) @@ -723,7 +694,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) lazy_ncbs = READ_ONCE(rdp->lazy_len); if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) || + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) || bypass_ncbs > 2 * qhimark)) { flush_bypass = true; } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) && @@ -779,7 +750,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (rcu_segcblist_ready_cbs(&rdp->cblist)) { needwake = rdp->nocb_cb_sleep; WRITE_ONCE(rdp->nocb_cb_sleep, false); - smp_mb(); /* CB invocation -after- GP end. */ } else { needwake = false; } @@ -846,7 +816,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); + timer_delete(&my_rdp->nocb_timer); } WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); @@ -867,16 +837,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (rdp_toggling) { - bool wake_state = false; - int ret; - - ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state); - if (ret == 1) - list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); - else if (ret == 0) - list_del(&rdp_toggling->nocb_entry_rdp); - if (wake_state) - swake_up_one(&rdp_toggling->nocb_state_wq); + nocb_gp_toggle_rdp(my_rdp, rdp_toggling); + swake_up_one(&rdp_toggling->nocb_state_wq); } my_rdp->nocb_gp_seq = -1; @@ -903,16 +865,9 @@ static int rcu_nocb_gp_kthread(void *arg) return 0; } -static inline bool nocb_cb_can_run(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) { - return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); + return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park(); } /* @@ -924,25 +879,33 @@ static void nocb_cb_wait(struct rcu_data *rdp) struct rcu_segcblist *cblist = &rdp->cblist; unsigned long cur_gp_seq; unsigned long flags; - bool needwake_state = false; bool needwake_gp = false; - bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - // VVV Ensure CB invocation follows _sleep test. - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + if (kthread_should_park()) { + /* + * kthread_park() must be preceded by an rcu_barrier(). + * But yet another rcu_barrier() might have sneaked in between + * the barrier callback execution and the callbacks counter + * decrement. + */ + if (rdp->nocb_cb_sleep) { + rcu_nocb_lock_irqsave(rdp, flags); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); + kthread_parkme(); } - } while (!nocb_cb_can_run(rdp)); + } else if (READ_ONCE(rdp->nocb_cb_sleep)) { + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); /* * Disable BH to provide the expected environment. Also, when @@ -962,37 +925,16 @@ static void nocb_cb_wait(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; - } - if (rcu_segcblist_ready_cbs(cblist)) - can_sleep = false; + if (!rcu_segcblist_ready_cbs(cblist)) { + WRITE_ONCE(rdp->nocb_cb_sleep, true); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We won't touch the callbacks and keep sleeping until we ever - * get re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) - needwake_state = true; + WRITE_ONCE(rdp->nocb_cb_sleep, false); } - WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); - - if (rdp->nocb_cb_sleep) - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); - rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); - - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); } /* @@ -1043,7 +985,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { unsigned long flags; - struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + struct rcu_data *rdp = timer_container_of(rdp, t, nocb_timer); WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); @@ -1076,25 +1018,11 @@ void rcu_nocb_flush_deferred_wakeup(void) } EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); -static int rdp_offload_toggle(struct rcu_data *rdp, - bool offload, unsigned long flags) - __releases(rdp->nocb_lock) +static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp) { - struct rcu_segcblist *cblist = &rdp->cblist; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; bool wake_gp = false; - - rcu_segcblist_offload(cblist, offload); - - if (rdp->nocb_cb_sleep) - rdp->nocb_cb_sleep = false; - rcu_nocb_unlock_irqrestore(rdp, flags); - - /* - * Ignore former value of nocb_cb_sleep and force wake up as it could - * have been spuriously set to false already. - */ - swake_up_one(&rdp->nocb_cb_wq); + unsigned long flags; raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); // Queue this rdp for add/del to/from the list to iterate on rcuog @@ -1108,97 +1036,73 @@ static int rdp_offload_toggle(struct rcu_data *rdp, return wake_gp; } -static long rcu_nocb_rdp_deoffload(void *arg) +static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int wake_gp; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + bool ret; /* - * rcu_nocb_rdp_deoffload() may be called directly if - * rcuog/o[p] spawn failed, because at this time the rdp->cpu - * is not online yet. + * Locking makes sure rcuog is done handling this rdp before deoffloaded + * enqueue can happen. Also it keeps the SEGCBLIST_OFFLOADED flag stable + * while the ->nocb_lock is held. */ - WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu)); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + +static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + unsigned long flags; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + /* CPU must be offline, unless it's early boot */ + WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id()); pr_info("De-offloading %d\n", rdp->cpu); - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Flush once and for all now. This suffices because we are - * running on the target CPU holding ->nocb_lock (thus having - * interrupts disabled), and because rdp_offload_toggle() - * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. - * Thus future calls to rcu_segcblist_completely_offloaded() will - * return false, which means that future calls to rcu_nocb_try_bypass() - * will refuse to put anything into the bypass. - */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); + /* Flush all callbacks from segcblist and bypass */ + rcu_barrier(); + /* - * Start with invoking rcu_core() early. This way if the current thread - * happens to preempt an ongoing call to rcu_core() in the middle, - * leaving some work dismissed because rcu_core() still thinks the rdp is - * completely offloaded, we are guaranteed a nearby future instance of - * rcu_core() to catch up. + * Make sure the rcuoc kthread isn't in the middle of a nocb locked + * sequence while offloading is deactivated, along with nocb locking. */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); - invoke_rcu_core(); - wake_gp = rdp_offload_toggle(rdp, false, flags); + if (rdp->nocb_cb_kthread) + kthread_park(rdp->nocb_cb_kthread); + + rcu_nocb_lock_irqsave(rdp, flags); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); + + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); + if (rdp_gp->nocb_gp_kthread) { if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); - /* - * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB. - * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog. - */ - if (!rdp->nocb_cb_kthread) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_nocb_unlock_irqrestore(rdp, flags); - } - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + rcu_nocb_rdp_deoffload_wait_cond(rdp)); } else { /* * No kthread to clear the flags for us or remove the rdp from the nocb list * to iterate. Do it here instead. Locking doesn't look stricly necessary * but we stick to paranoia in this rare path. */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, - SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); - rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); list_del(&rdp->nocb_entry_rdp); } - mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - - /* - * Lock one last time to acquire latest callback updates from kthreads - * so we can later handle callbacks locally without locking. - */ - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Theoretically we could clear SEGCBLIST_LOCKING after the nocb - * lock is released but how about being paranoid for once? - */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING); - /* - * Without SEGCBLIST_LOCKING, we can't use - * rcu_nocb_unlock_irqrestore() anymore. - */ - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - - /* Sanity check */ - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); return 0; } @@ -1209,33 +1113,41 @@ int rcu_nocb_cpu_deoffload(int cpu) int ret = 0; cpus_read_lock(); - mutex_lock(&rcu_state.barrier_mutex); + mutex_lock(&rcu_state.nocb_mutex); if (rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_deoffload(rdp); if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); -static long rcu_nocb_rdp_offload(void *arg) +static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + +static int rcu_nocb_rdp_offload(struct rcu_data *rdp) +{ int wake_gp; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + WARN_ON_ONCE(cpu_online(rdp->cpu)); /* * For now we only support re-offload, ie: the rdp must have been * offloaded on boot first. @@ -1243,47 +1155,22 @@ static long rcu_nocb_rdp_offload(void *arg) if (!rdp->nocb_gp_rdp) return -EINVAL; - if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + if (WARN_ON_ONCE(!rdp->nocb_gp_kthread)) return -EINVAL; pr_info("Offloading %d\n", rdp->cpu); - /* - * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING - * is set. - */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); - /* - * We didn't take the nocb lock while working on the - * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode). - * Every modifications that have been done previously on - * rdp->cblist must be visible remotely by the nocb kthreads - * upon wake up after reading the cblist flags. - * - * The layout against nocb_lock enforces that ordering: - * - * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() - * ------------------------- ---------------------------- - * WRITE callbacks rcu_nocb_lock() - * rcu_nocb_lock() READ flags - * WRITE flags READ callbacks - * rcu_nocb_unlock() rcu_nocb_unlock() - */ - wake_gp = rdp_offload_toggle(rdp, true, flags); + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); + wake_up_process(rdp->nocb_gp_kthread); + swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_nocb_rdp_offload_wait_cond(rdp)); - /* - * All kthreads are ready to work, we can finally relieve rcu_core() and - * enable nocb bypass. - */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); - rcu_nocb_unlock_irqrestore(rdp, flags); + kthread_unpark(rdp->nocb_cb_kthread); return 0; } @@ -1294,37 +1181,47 @@ int rcu_nocb_cpu_offload(int cpu) int ret = 0; cpus_read_lock(); - mutex_lock(&rcu_state.barrier_mutex); + mutex_lock(&rcu_state.nocb_mutex); if (!rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_offload(rdp); if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); +#ifdef CONFIG_RCU_LAZY static unsigned long lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { int cpu; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + + /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ + if (!mutex_trylock(&rcu_state.nocb_mutex)) + return 0; + /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); count += READ_ONCE(rdp->lazy_len); } + mutex_unlock(&rcu_state.nocb_mutex); + return count ? count : SHRINK_EMPTY; } @@ -1335,15 +1232,45 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long flags; unsigned long count = 0; + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + /* + * Protect against concurrent (de-)offloading. Otherwise nocb locking + * may be ignored or imbalanced. + */ + if (!mutex_trylock(&rcu_state.nocb_mutex)) { + /* + * But really don't insist if nocb_mutex is contended since we + * can't guarantee that it will never engage in a dependency + * chain involving memory allocation. The lock is seldom contended + * anyway. + */ + return 0; + } + /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { + for_each_cpu(cpu, rcu_nocb_mask) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int _count = READ_ONCE(rdp->lazy_len); + int _count; + + if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp))) + continue; - if (_count == 0) + if (!READ_ONCE(rdp->lazy_len)) continue; + rcu_nocb_lock_irqsave(rdp, flags); - WRITE_ONCE(rdp->lazy_len, 0); + /* + * Recheck under the nocb lock. Since we are not holding the bypass + * lock we may still race with increments from the enqueuer but still + * we know for sure if there is at least one lazy callback. + */ + _count = READ_ONCE(rdp->lazy_len); + if (!_count) { + rcu_nocb_unlock_irqrestore(rdp, flags); + continue; + } + rcu_nocb_try_flush_bypass(rdp, jiffies); rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); sc->nr_to_scan -= _count; @@ -1351,21 +1278,19 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) if (sc->nr_to_scan <= 0) break; } + + mutex_unlock(&rcu_state.nocb_mutex); + return count ? count : SHRINK_STOP; } - -static struct shrinker lazy_rcu_shrinker = { - .count_objects = lazy_rcu_shrink_count, - .scan_objects = lazy_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; +#endif // #ifdef CONFIG_RCU_LAZY void __init rcu_init_nohz(void) { int cpu; struct rcu_data *rdp; const struct cpumask *cpumask = NULL; + struct shrinker * __maybe_unused lazy_rcu_shrinker; #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) @@ -1391,8 +1316,17 @@ void __init rcu_init_nohz(void) if (!rcu_state.nocb_is_setup) return; - if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy")) - pr_err("Failed to register lazy_rcu shrinker!\n"); +#ifdef CONFIG_RCU_LAZY + lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy"); + if (!lazy_rcu_shrinker) { + pr_err("Failed to allocate lazy_rcu shrinker!\n"); + } else { + lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count; + lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan; + + shrinker_register(lazy_rcu_shrinker); + } +#endif // #ifdef CONFIG_RCU_LAZY if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); @@ -1411,9 +1345,7 @@ void __init rcu_init_nohz(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); } rcu_organize_nocb_kthreads(); } @@ -1461,7 +1393,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuog/%d", rdp_gp->cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - goto end; + goto err; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) @@ -1470,10 +1402,15 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_cb_kthread, rdp, - "rcuo%c/%d", rcu_state.abbr, cpu); + t = kthread_create(rcu_nocb_cb_kthread, rdp, + "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - goto end; + goto err; + + if (rcu_rdp_is_offloaded(rdp)) + wake_up_process(t); + else + kthread_park(t); if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); @@ -1481,13 +1418,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); return; -end: - mutex_lock(&rcu_state.barrier_mutex); + +err: + /* + * No need to protect against concurrent rcu_barrier() + * because the number of callbacks should be 0 for a non-boot CPU, + * therefore rcu_barrier() shouldn't even try to grab the nocb_lock. + * But hold nocb_mutex to avoid nocb_lock imbalance from shrinker. + */ + WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist)); + mutex_lock(&rcu_state.nocb_mutex); if (rcu_rdp_is_offloaded(rdp)) { rcu_nocb_rdp_deoffload(rdp); cpumask_clear_cpu(cpu, rcu_nocb_mask); } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ @@ -1605,8 +1550,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { - char bufw[20]; - char bufr[20]; + char bufd[22]; + char bufw[45]; + char bufr[45]; + char bufn[22]; + char bufb[22]; struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; @@ -1615,19 +1563,25 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return; + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, &rdp->nocb_entry_rdp, typeof(*rdp), nocb_entry_rdp); - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]); + sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL], + rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]); + sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass)); + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], - "cC"[!!atomic_read(&rdp->nocb_lock_contended)], "lL"[raw_spin_is_locked(&rdp->nocb_lock)], "sS"[!!rdp->nocb_cb_sleep], ".W"[swait_active(&rdp->nocb_cb_wq)], @@ -1635,12 +1589,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], + rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd, ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn, ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb, rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, @@ -1664,16 +1621,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return 0; -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - return false; -} - /* No ->nocb_lock to acquire. */ static void rcu_nocb_lock(struct rcu_data *rdp) { @@ -1721,10 +1668,10 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, return true; } -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags, bool lazy) +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) { - return false; + WARN_ON_ONCE(1); /* Should be dead code! */ } static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7b0fe741a088..dbe2d02be824 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -24,12 +24,13 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) * timers have their own means of synchronization against the * offloaded state updaters. */ - RCU_LOCKDEP_WARN( + RCU_NOCB_LOCKDEP_WARN( !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || - rcu_lockdep_is_held_nocb(rdp) || - (rdp == this_cpu_ptr(&rcu_data) && - !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || + lockdep_is_held(&rdp->nocb_lock) || + lockdep_is_held(&rcu_state.nocb_mutex) || + ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) && + rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), "Unsafe read of RCU_NOCB offloaded state" ); @@ -93,6 +94,16 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); + if (nohz_full_patience_delay < 0) { + pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay); + nohz_full_patience_delay = 0; + } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) { + pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC); + nohz_full_patience_delay = 5 * MSEC_PER_SEC; + } else if (nohz_full_patience_delay) { + pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay); + } + nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay); if (!use_softirq) pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) @@ -172,9 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) switch (blkd_state) { case 0: case RCU_EXP_TASKS: - case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD: case RCU_GP_TASKS: - case RCU_GP_TASKS + RCU_EXP_TASKS: + case RCU_GP_TASKS | RCU_EXP_TASKS: /* * Blocking neither GP, or first task blocking the normal @@ -187,10 +198,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) case RCU_EXP_BLKD: case RCU_GP_BLKD: - case RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: /* * First task arriving that blocks either GP, or first task @@ -203,9 +214,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); break; - case RCU_EXP_TASKS + RCU_EXP_BLKD: - case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD: /* * Second or subsequent task blocking the expedited GP. @@ -216,8 +227,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add(&t->rcu_node_entry, rnp->exp_tasks); break; - case RCU_GP_TASKS + RCU_GP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD: /* * Second or subsequent task blocking the normal GP. @@ -257,11 +268,14 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * GP should not be able to end until we report, so there should be * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) + * + * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change. */ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); else WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); } /* @@ -472,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) struct rcu_node *rnp; union rcu_special special; + rdp = this_cpu_ptr(&rcu_data); + if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING) + rdp->defer_qs_iw_pending = DEFER_QS_IDLE; + /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; - rdp = this_cpu_ptr(&rcu_data); if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; @@ -520,7 +537,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_exp_done(rnp); - smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; @@ -612,8 +628,93 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) { struct rcu_data *rdp; + lockdep_assert_irqs_disabled(); rdp = container_of(iwp, struct rcu_data, defer_qs_iw); - rdp->defer_qs_iw_pending = false; + + /* + * If the IRQ work handler happens to run in the middle of RCU read-side + * critical section, it could be ineffective in getting the scheduler's + * attention to report a deferred quiescent state (the whole point of the + * IRQ work). For this reason, requeue the IRQ work. + * + * Basically, we want to avoid following situation: + * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING) + * 2. CPU enters new rcu_read_lock() + * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0 + * 4. rcu_read_unlock() does not re-queue work (state still PENDING) + * 5. Deferred QS reporting does not happen. + */ + if (rcu_preempt_depth() > 0) + WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE); +} + +/* + * Check if expedited grace period processing during unlock is needed. + * + * This function determines whether expedited handling is required based on: + * 1. Task blocking an expedited grace period (based on a heuristic, could be + * false-positive, see below.) + * 2. CPU participating in an expedited grace period + * 3. Strict grace period mode requiring expedited handling + * 4. RCU priority deboosting needs when interrupts were disabled + * + * @t: The task being checked + * @rdp: The per-CPU RCU data + * @rnp: The RCU node for this CPU + * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock() + * + * Returns true if expedited processing of the rcu_read_unlock() is needed. + */ +static bool rcu_unlock_needs_exp_handling(struct task_struct *t, + struct rcu_data *rdp, + struct rcu_node *rnp, + bool irqs_were_disabled) +{ + /* + * Check if this task is blocking an expedited grace period. If the + * task was preempted within an RCU read-side critical section and is + * on the expedited grace period blockers list (exp_tasks), we need + * expedited handling to unblock the expedited GP. This is not an exact + * check because 't' might not be on the exp_tasks list at all - its + * just a fast heuristic that can be false-positive sometimes. + */ + if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) + return true; + + /* + * Check if this CPU is participating in an expedited grace period. + * The expmask bitmap tracks which CPUs need to check in for the + * current expedited GP. If our CPU's bit is set, we need expedited + * handling to help complete the expedited GP. + */ + if (rdp->grpmask & READ_ONCE(rnp->expmask)) + return true; + + /* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods + * are treated as short for testing purposes even if that means + * disturbing the system more. Check if either: + * - This CPU has not yet reported a quiescent state, or + * - This task was preempted within an RCU critical section + * In either case, require expedited handling for strict GP mode. + */ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) + return true; + + /* + * RCU priority boosting case: If a task is subject to RCU priority + * boosting and exits an RCU read-side critical section with interrupts + * disabled, we need expedited handling to ensure timely deboosting. + * Without this, a low-priority task could incorrectly run at high + * real-time priority for an extended period degrading real-time + * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels, + * not just to PREEMPT_RT. + */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node) + return true; + + return false; } /* @@ -635,18 +736,14 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - bool expboost; // Expedited GP in flight or possible boosting. + bool needs_exp; // Expedited handling needed. struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) || - (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && - t->rcu_blocked_node); + needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled); + // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) { + if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the // wakeup is free or there is either an expedited // GP in flight or a potential need to deboost. @@ -656,20 +753,13 @@ static void rcu_read_unlock_special(struct task_struct *t) // Also if no expediting and no possible deboosting, // slow is OK. Plus nohz_full CPUs eventually get // tick enabled. - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { + needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && + cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - IS_ENABLED(CONFIG_PREEMPT_RT)) - rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( - rcu_preempt_deferred_qs_handler); - else - init_irq_work(&rdp->defer_qs_iw, - rcu_preempt_deferred_qs_handler); - rdp->defer_qs_iw_pending = true; + rdp->defer_qs_iw_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } @@ -722,10 +812,8 @@ static void rcu_flavor_sched_clock_irq(int user) if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ - if (rcu_preempt_need_deferred_qs(t)) { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + if (rcu_preempt_need_deferred_qs(t)) + set_need_resched_current(); } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; @@ -803,11 +891,15 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) rdp = per_cpu_ptr(&rcu_data, cpu); pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", cpu, ".o"[rcu_rdp_cpu_online(rdp)], - (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, - (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state); } } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) +{ + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); +} #else /* #ifdef CONFIG_PREEMPT_RCU */ /* @@ -819,8 +911,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); @@ -857,7 +958,7 @@ static void rcu_qs(void) /* * Register an urgently needed quiescent state. If there is an - * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * emergency, invoke rcu_momentary_eqs() to do a heavy-weight * dyntick-idle quiescent state visible to other CPUs, which will in * some cases serve for expedited as well as normal grace periods. * Either way, register a lightweight quiescent state. @@ -877,7 +978,7 @@ void rcu_all_qs(void) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } rcu_qs(); @@ -897,7 +998,7 @@ void rcu_note_context_switch(bool preempt) goto out; this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); out: rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); @@ -941,7 +1042,7 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - if (rdp->cpu_no_qs.b.exp) + if (READ_ONCE(rdp->cpu_no_qs.b.exp)) rcu_report_exp_rdp(rdp); } @@ -961,13 +1062,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs @@ -995,6 +1099,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { } + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* @@ -1193,63 +1299,22 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct sched_param sp; struct task_struct *t; - mutex_lock(&rnp->boost_kthread_mutex); - if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) - goto out; + if (rnp->boost_kthread_task) + return; t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); if (WARN_ON_ONCE(IS_ERR(t))) - goto out; + return; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + rcu_thread_affine_rnp(t, rnp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - - out: - mutex_unlock(&rnp->boost_kthread_mutex); -} - -/* - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - * - * Any future concurrent calls are serialized via ->boost_kthread_mutex. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ - struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask; - cpumask_var_t cm; - int cpu; - - if (!t) - return; - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - mutex_lock(&rnp->boost_kthread_mutex); - mask = rcu_rnp_online_cpus(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) - if ((mask & leaf_node_cpu_bit(rnp, cpu)) && - cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) { - cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (outgoingcpu >= 0) - cpumask_clear_cpu(outgoingcpu, cm); - } - set_cpus_allowed_ptr(t, cm); - mutex_unlock(&rnp->boost_kthread_mutex); - free_cpumask_var(cm); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1268,10 +1333,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { } -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ -} - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 5653560573e2..b67532cb8770 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,15 +7,69 @@ * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ +#include <linux/console.h> #include <linux/kvm_para.h> +#include <linux/rcu_notifier.h> +#include <linux/smp.h> ////////////////////////////////////////////////////////////////////////////// // // Controlling CPU stall warnings, including delay calculation. /* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; -int sysctl_max_rcu_stall_to_panic __read_mostly; +static int sysctl_panic_on_rcu_stall __read_mostly; +static int sysctl_max_rcu_stall_to_panic __read_mostly; + +static const struct ctl_table rcu_stall_sysctl_table[] = { + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "max_rcu_stall_to_panic", + .data = &sysctl_max_rcu_stall_to_panic, + .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, +}; + +static int __init init_rcu_stall_sysctl(void) +{ + register_sysctl_init("kernel", rcu_stall_sysctl_table); + return 0; +} + +subsys_initcall(init_rcu_stall_sysctl); + +#ifdef CONFIG_SYSFS + +static unsigned int rcu_stall_count; + +static ssize_t rcu_stall_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", rcu_stall_count); +} + +static struct kobj_attribute rcu_stall_count_attr = __ATTR_RO(rcu_stall_count); + +static __init int kernel_rcu_stall_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &rcu_stall_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_rcu_stall_sysfs_init); + +#endif // CONFIG_SYSFS #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) @@ -39,7 +93,7 @@ int rcu_exp_jiffies_till_stall_check(void) // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range. // The minimum clamped value is "2UL", because at least one full // tick has to be guaranteed. - till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ); + till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ); if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout) WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); @@ -73,36 +127,6 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); -/** - * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? - * - * Returns @true if the current grace period is sufficiently old that - * it is reasonable to assume that it might be stalled. This can be - * useful when deciding whether to allocate memory to enable RCU-mediated - * freeing on the one hand or just invoking synchronize_rcu() on the other. - * The latter is preferable when the grace period is stalled. - * - * Note that sampling of the .gp_start and .gp_seq fields must be done - * carefully to avoid false positives at the beginnings and ends of - * grace periods. - */ -bool rcu_gp_might_be_stalled(void) -{ - unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; - unsigned long j = jiffies; - - if (d < RCU_STALL_MIGHT_MIN) - d = RCU_STALL_MIGHT_MIN; - smp_mb(); // jiffies before .gp_seq to avoid false positives. - if (!rcu_gp_in_progress()) - return false; - // Long delays at this point avoids false positive, but a delay - // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. - smp_mb(); // .gp_seq before second .gp_start - // And ditto here. - return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); -} - /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -139,6 +163,13 @@ static void panic_on_rcu_stall(void) { static int cpu_stall; + /* + * Attempt to kick out the BPF scheduler if it's installed and defer + * the panic to give the system a chance to recover. + */ + if (scx_rcu_cpu_stall()) + return; + if (++cpu_stall < sysctl_max_rcu_stall_to_panic) return; @@ -149,12 +180,17 @@ static void panic_on_rcu_stall(void) /** * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period * + * To perform the reset request from the caller, disable stall detection until + * 3 fqs loops have passed. This is required to ensure a fresh jiffies is + * loaded. It should be safe to do from the fqs loop as enough timer + * interrupts and context switches should have passed. + * * The caller must disable hard irqs. */ void rcu_cpu_stall_reset(void) { - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3); + WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX); } ////////////////////////////////////////////////////////////////////////////// @@ -170,6 +206,7 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0); WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); @@ -356,22 +393,32 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) * that don't support NMI-based stack dumps. The NMI-triggered stack * traces are more accurate because they are printed by the target CPU. */ -static void rcu_dump_cpu_stacks(void) +static void rcu_dump_cpu_stacks(unsigned long gp_seq) { int cpu; unsigned long flags; struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) + printk_deferred_enter(); + for_each_leaf_node_possible_cpu(rnp, cpu) { + if (gp_seq != data_race(rcu_state.gp_seq)) { + printk_deferred_exit(); + pr_err("INFO: Stall ended during stack backtracing.\n"); + return; + } + if (!(data_race(rnp->qsmask) & leaf_node_cpu_bit(rnp, cpu))) + continue; + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { if (cpu_is_offline(cpu)) pr_err("Offline CPU %d blocking current GP.\n", cpu); else dump_cpu_task(cpu); } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + printk_deferred_exit(); } } @@ -428,6 +475,35 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp return j > 2 * HZ; } +static void print_cpu_stat_info(int cpu) +{ + struct rcu_snap_record rsr, *rsrp; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu); + + if (!rcu_cpu_stall_cputime) + return; + + rsrp = &rdp->snap_record; + if (rsrp->gp_seq != rdp->gp_seq) + return; + + rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); + rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); + rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); + + pr_err("\t hardirqs softirqs csw/system\n"); + pr_err("\t number: %8lld %10d %12lld\n", + kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs, + kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, + nr_context_switches_cpu(cpu) - rsrp->nr_csw); + pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n", + div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC), + div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC), + div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC), + jiffies_to_msecs(jiffies - rsrp->jiffies)); +} + /* * Print out diagnostic information for the specified stalled CPU. * @@ -465,10 +541,11 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)); + rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) - sprintf(buf, " rcuc=%ld jiffies(starved)", j); + // Print signed value, as negative values indicate a probable bug. + snprintf(buf, sizeof(buf), " rcuc=%ld jiffies(starved)", j); pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n", cpu, "O."[!!cpu_online(cpu)], @@ -478,12 +555,14 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - rcu_dynticks_snap(cpu) & 0xffff, - ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), + ct_rcu_watching_cpu(cpu) & 0xffff, + ct_nesting_cpu(cpu), ct_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, rcuc_starved ? buf : "", falsepositive ? " (false positive?)" : ""); + + print_cpu_stat_info(cpu); } /* Complain about starvation of grace-period kthread. */ @@ -503,16 +582,16 @@ static void rcu_check_gp_kthread_starvation(void) data_race(READ_ONCE(rcu_state.gp_state)), gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu); if (gpk) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); - if (cpu >= 0) { - if (cpu_is_offline(cpu)) { - pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); - } else { - pr_err("Stack dump where RCU GP kthread last ran:\n"); - dump_cpu_task(cpu); - } + if (cpu_is_offline(cpu)) { + pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); + } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) { + pr_err("Stack dump where RCU GP kthread last ran:\n"); + dump_cpu_task(cpu); } wake_up_process(gpk); } @@ -541,7 +620,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void) pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n", rcu_state.name, (jiffies - jiffies_fqs), (long)rcu_seq_current(&rcu_state.gp_seq), - data_race(rcu_state.gp_flags), + data_race(READ_ONCE(rcu_state.gp_flags)), // Diagnostic read gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, data_race(READ_ONCE(gpk->__state))); pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", @@ -566,6 +645,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) if (rcu_stall_is_suppressed()) return; + nbcon_cpu_emergency_enter(); + /* * OK, time to rat on our buddy... * See Documentation/RCU/stallwarn.rst for info on how to debug @@ -588,11 +669,12 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", + pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", smp_processor_id(), (long)(jiffies - gps), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, + data_race(rcu_state.n_online_cpus)); // Diagnostic read if (ndetected) { - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); /* Complain about tasks blocking the grace period. */ rcu_for_each_leaf_node(rnp) @@ -617,12 +699,14 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); rcu_force_quiescent_state(); /* Kick them all. */ } -static void print_cpu_stall(unsigned long gps) +static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; unsigned long flags; @@ -637,6 +721,8 @@ static void print_cpu_stall(unsigned long gps) if (rcu_stall_is_suppressed()) return; + nbcon_cpu_emergency_enter(); + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.rst for info on how to debug @@ -649,14 +735,15 @@ static void print_cpu_stall(unsigned long gps) raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); for_each_possible_cpu(cpu) totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", + pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", jiffies - gps, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, + data_race(rcu_state.n_online_cpus)); // Diagnostic read rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); - rcu_dump_cpu_stacks(); + rcu_dump_cpu_stacks(gp_seq); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Rewrite if needed in case of slow consoles. */ @@ -665,6 +752,8 @@ static void print_cpu_stall(unsigned long gps) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); /* @@ -674,13 +763,15 @@ static void print_cpu_stall(unsigned long gps) * progress and it could be we're stuck in kernel space without context * switches for an entirely unreasonable amount of time. */ - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } +static bool csd_lock_suppress_rcu_stall; +module_param(csd_lock_suppress_rcu_stall, bool, 0644); + static void check_cpu_stall(struct rcu_data *rdp) { - bool didstall = false; + bool self_detected; unsigned long gs1; unsigned long gs2; unsigned long gps; @@ -694,6 +785,16 @@ static void check_cpu_stall(struct rcu_data *rdp) !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); + + /* + * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS + * loop has to set jiffies to ensure a non-stale jiffies value. This + * is required to have good jiffies value after coming out of long + * breaks of jiffies updates. Not doing so can cause false positives. + */ + if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0) + return; + j = jiffies; /* @@ -723,14 +824,15 @@ static void check_cpu_stall(struct rcu_data *rdp) gs2 = READ_ONCE(rcu_state.gp_seq); if (gs1 != gs2 || ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) + ULONG_CMP_GE(gps, js) || + !rcu_seq_state(gs2)) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + ULONG_MAX / 2; + self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask; if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - /* * If a virtual machine is stopped by the host it can look to * the watchdog like an RCU stall. Check to see if the host @@ -739,39 +841,34 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(gps); - if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) - rcu_ftrace_dump(DUMP_ALL); - didstall = true; - - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { +#ifdef CONFIG_SYSFS + ++rcu_stall_count; +#endif - /* - * If a virtual machine is stopped by the host it can look to - * the watchdog like an RCU stall. Check to see if the host - * stopped the vm. - */ - if (kvm_check_and_clear_guest_paused()) - return; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + } else if (self_detected) { + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(gs2, gps); + } else { + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(gs2, gps); + } - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2, gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); - didstall = true; - } - if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) { - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - WRITE_ONCE(rcu_state.jiffies_stall, jn); + + if (READ_ONCE(rcu_state.jiffies_stall) == jn) { + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcu_state.jiffies_stall, jn); + } } } ////////////////////////////////////////////////////////////////////////////// // -// RCU forward-progress mechanisms, including of callback invocation. +// RCU forward-progress mechanisms, including for callback invocation. /* @@ -891,8 +988,7 @@ void show_rcu_gp_kthreads(void) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); cbs += data_race(READ_ONCE(rdp->n_cbs_invoked)); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) - show_rcu_nocb_state(rdp); + show_rcu_nocb_state(rdp); } pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); @@ -1004,7 +1100,7 @@ static bool sysrq_rcu; module_param(sysrq_rcu, bool, 0444); /* Dump grace-period-request information due to commandeered sysrq. */ -static void sysrq_show_rcu(int key) +static void sysrq_show_rcu(u8 key) { show_rcu_gp_kthreads(); } @@ -1023,3 +1119,67 @@ static int __init rcu_sysrq_init(void) return 0; } early_initcall(rcu_sysrq_init); + +#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER + +////////////////////////////////////////////////////////////////////////////// +// +// RCU CPU stall-warning notifiers + +static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list); + +/** + * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier + * @n: Entry to add. + * + * Adds an RCU CPU stall notifier to an atomic notifier chain. + * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or + * friends. The @data will be the duration of the stalled grace period, + * in jiffies, coerced to a void* pointer. + * + * Returns 0 on success, %-EEXIST on error. + */ +int rcu_stall_chain_notifier_register(struct notifier_block *n) +{ + int rcsn = rcu_cpu_stall_notifiers; + + WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call, + rcsn ? "possibly suppressing RCU CPU stall warnings" : "failed, so all is well"); + if (rcsn) + return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n); + return -EEXIST; +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register); + +/** + * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier + * @n: Entry to add. + * + * Removes an RCU CPU stall notifier from an atomic notifier chain. + * + * Returns zero on success, %-ENOENT on failure. + */ +int rcu_stall_chain_notifier_unregister(struct notifier_block *n) +{ + return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n); +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister); + +/* + * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in the RCU CPU stall notifier chain in turn, which + * is an atomic call chain. See atomic_notifier_call_chain() for more + * information. + * + * This is for use within RCU, hence the omission of the extra asterisk + * to indicate a non-kerneldoc format header comment. + */ +int rcu_stall_notifier_call_chain(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v); +} + +#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f5e6a2f95a2a..dfeba9b35395 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -25,6 +25,7 @@ #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> +#include <linux/torture.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/percpu.h> @@ -116,7 +117,7 @@ static bool rcu_read_lock_held_common(bool *ret) return false; } -int rcu_read_lock_sched_held(void) +int notrace rcu_read_lock_sched_held(void) { bool ret; @@ -144,8 +145,45 @@ bool rcu_gp_is_normal(void) } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); -static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); +static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1); +/* + * Should call_rcu() callbacks be processed with urgency or are + * they OK being executed with arbitrary delays? + */ +bool rcu_async_should_hurry(void) +{ + return !IS_ENABLED(CONFIG_RCU_LAZY) || + atomic_read(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_should_hurry); +/** + * rcu_async_hurry - Make future async RCU callbacks not lazy. + * + * After a call to this function, future calls to call_rcu() + * will be processed in a timely fashion. + */ +void rcu_async_hurry(void) +{ + if (IS_ENABLED(CONFIG_RCU_LAZY)) + atomic_inc(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_hurry); + +/** + * rcu_async_relax - Make future async RCU callbacks lazy. + * + * After a call to this function, future calls to call_rcu() + * will be processed in a lazy fashion. + */ +void rcu_async_relax(void) +{ + if (IS_ENABLED(CONFIG_RCU_LAZY)) + atomic_dec(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_relax); + +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); /* * Should normal grace-period primitives be expedited? Intended for * use within RCU. Note that this function takes the rcu_expedited @@ -195,6 +233,7 @@ static bool rcu_boot_ended __read_mostly; void rcu_end_inkernel_boot(void) { rcu_unexpedite_gp(); + rcu_async_relax(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); rcu_boot_ended = true; @@ -220,6 +259,7 @@ void rcu_test_sync_prims(void) { if (!IS_ENABLED(CONFIG_PROVE_RCU)) return; + pr_info("Running RCU synchronous self tests\n"); synchronize_rcu(); synchronize_rcu_expedited(); } @@ -302,7 +342,7 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); * Note that rcu_read_lock() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ -int rcu_read_lock_held(void) +int notrace rcu_read_lock_held(void) { bool ret; @@ -327,7 +367,7 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); * Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ -int rcu_read_lock_bh_held(void) +int notrace rcu_read_lock_bh_held(void) { bool ret; @@ -337,7 +377,7 @@ int rcu_read_lock_bh_held(void) } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); -int rcu_read_lock_any_held(void) +int notrace rcu_read_lock_any_held(void) { bool ret; @@ -368,7 +408,7 @@ void wakeme_after_rcu(struct rcu_head *head) } EXPORT_SYMBOL_GPL(wakeme_after_rcu); -void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, +void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array) { int i; @@ -400,7 +440,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, if (crcu_array[j] == crcu_array[i]) break; if (j == i) { - wait_for_completion(&rs_array[i].completion); + wait_for_completion_state(&rs_array[i].completion, state); destroy_rcu_head_on_stack(&rs_array[i].head); } } @@ -485,22 +525,28 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif -#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) /* Get rcutorture access to sched_setaffinity(). */ -long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn) { int ret; ret = sched_setaffinity(pid, in_mask); - WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); + WARN_ONCE(dowarn && ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); return ret; } -EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); +EXPORT_SYMBOL_GPL(torture_sched_setaffinity); #endif +int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful) +EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers); + #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_ftrace_dump __read_mostly; module_param(rcu_cpu_stall_ftrace_dump, int, 0644); +#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER +module_param(rcu_cpu_stall_notifiers, int, 0444); +#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); @@ -508,6 +554,10 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; module_param(rcu_exp_cpu_stall_timeout, int, 0644); +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME); +module_param(rcu_cpu_stall_cputime, int, 0644); +bool rcu_exp_stall_task_details __read_mostly; +module_param(rcu_exp_stall_task_details, bool, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall @@ -555,9 +605,12 @@ struct early_boot_kfree_rcu { static void early_boot_test_call_rcu(void) { static struct rcu_head head; + int idx; static struct rcu_head shead; struct early_boot_kfree_rcu *rhp; + idx = srcu_down_read(&early_srcu); + srcu_up_read(&early_srcu, idx); call_rcu(&head, test_callback); early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu); call_srcu(&early_srcu, &shead, test_callback); @@ -586,6 +639,7 @@ static int rcu_verify_early_boot_tests(void) early_boot_test_counter++; srcu_barrier(&early_srcu); WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie)); + cleanup_srcu_struct(&early_srcu); } if (rcu_self_test_counter != early_boot_test_counter) { WARN_ON(1); |
