summaryrefslogtreecommitdiff
path: root/kernel/rcu
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcu')
-rw-r--r--kernel/rcu/Kconfig71
-rw-r--r--kernel/rcu/Kconfig.debug45
-rw-r--r--kernel/rcu/rcu.h61
-rw-r--r--kernel/rcu/rcu_segcblist.c11
-rw-r--r--kernel/rcu/rcu_segcblist.h12
-rw-r--r--kernel/rcu/rcuscale.c225
-rw-r--r--kernel/rcu/rcutorture.c882
-rw-r--r--kernel/rcu/refscale.c186
-rw-r--r--kernel/rcu/srcutiny.c40
-rw-r--r--kernel/rcu/srcutree.c347
-rw-r--r--kernel/rcu/sync.c4
-rw-r--r--kernel/rcu/tasks.h304
-rw-r--r--kernel/rcu/tiny.c35
-rw-r--r--kernel/rcu/tree.c1682
-rw-r--r--kernel/rcu/tree.h37
-rw-r--r--kernel/rcu/tree_exp.h232
-rw-r--r--kernel/rcu/tree_nocb.h435
-rw-r--r--kernel/rcu/tree_plugin.h85
-rw-r--r--kernel/rcu/tree_stall.h97
-rw-r--r--kernel/rcu/update.c8
20 files changed, 2693 insertions, 2106 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index e7d2dd267593..4d9b21f69eaa 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -18,7 +18,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPTION
+ default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC)
select TREE_RCU
help
This option selects the RCU implementation that is
@@ -31,7 +31,7 @@ config PREEMPT_RCU
config TINY_RCU
bool
- default y if !PREEMPTION && !SMP
+ default y if !PREEMPT_RCU && !SMP
help
This option selects the RCU implementation that is
designed for UP systems from which real-time response
@@ -65,6 +65,19 @@ config TREE_SRCU
help
This option selects the full-fledged version of SRCU.
+config FORCE_NEED_SRCU_NMI_SAFE
+ bool "Force selection of NEED_SRCU_NMI_SAFE"
+ depends on !TINY_SRCU
+ depends on RCU_EXPERT
+ depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
+ select NEED_SRCU_NMI_SAFE
+ default n
+ help
+ This option forces selection of the NEED_SRCU_NMI_SAFE
+ Kconfig option, allowing testing of srcu_read_lock_nmisafe()
+ and srcu_read_unlock_nmisafe() on architectures (like x86)
+ that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option.
+
config NEED_SRCU_NMI_SAFE
def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
@@ -85,9 +98,13 @@ config FORCE_TASKS_RCU
idle, and user-mode execution as quiescent states. Not for
manual selection in most cases.
-config TASKS_RCU
+config NEED_TASKS_RCU
bool
default n
+
+config TASKS_RCU
+ bool
+ default NEED_TASKS_RCU && PREEMPTION
select IRQ_WORK
config FORCE_TASKS_RUDE_RCU
@@ -245,16 +262,24 @@ config RCU_NOCB_CPU
workloads will incur significant increases in context-switch
rates.
- This option offloads callback invocation from the set of CPUs
- specified at boot time by the rcu_nocbs parameter. For each
- such CPU, a kthread ("rcuox/N") will be created to invoke
- callbacks, where the "N" is the CPU being offloaded, and where
- the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for
- RCU-sched (!PREEMPTION kernels). Nothing prevents this kthread
- from running on the specified CPUs, but (1) the kthreads may be
- preempted between each callback, and (2) affinity or cgroups can
- be used to force the kthreads to run on whatever set of CPUs is
- desired.
+ This option offloads callback invocation from the set of
+ CPUs specified at boot time by the rcu_nocbs parameter.
+ For each such CPU, a kthread ("rcuox/N") will be created to
+ invoke callbacks, where the "N" is the CPU being offloaded,
+ and where the "x" is "p" for RCU-preempt (PREEMPTION kernels)
+ and "s" for RCU-sched (!PREEMPTION kernels). This option
+ also creates another kthread for each sqrt(nr_cpu_ids) CPUs
+ ("rcuog/N", where N is the first CPU in that group to come
+ online), which handles grace periods for its group. Nothing
+ prevents these kthreads from running on the specified CPUs,
+ but (1) the kthreads may be preempted between each callback,
+ and (2) affinity or cgroups can be used to force the kthreads
+ to run on whatever set of CPUs is desired.
+
+ The sqrt(nr_cpu_ids) grouping may be overridden using the
+ rcutree.rcu_nocb_gp_stride kernel boot parameter. This can
+ be especially helpful for smaller numbers of CPUs, where
+ sqrt(nr_cpu_ids) can be a bit of a blunt instrument.
Say Y here if you need reduced OS jitter, despite added overhead.
Say N here if you are unsure.
@@ -311,21 +336,27 @@ config RCU_LAZY
depends on RCU_NOCB_CPU
default n
help
- To save power, batch RCU callbacks and flush after delay, memory
- pressure, or callback list growing too big.
+ To save power, batch RCU callbacks and delay starting the
+ corresponding grace period for multiple seconds. The grace
+ period will be started after this delay, in case of memory
+ pressure, or if the corresponding CPU's callback list grows
+ too large.
- Requires rcu_nocbs=all to be set.
+ These delays happen only on rcu_nocbs CPUs, that is, CPUs
+ whose callbacks have been offloaded.
- Use rcutree.enable_rcu_lazy=0 to turn it off at boot time.
+ Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to
+ globally disable these delays.
config RCU_LAZY_DEFAULT_OFF
bool "Turn RCU lazy invocation off by default"
depends on RCU_LAZY
default n
help
- Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default
- off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch
- it back on.
+ Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel
+ to boot with these energy-efficiency delays disabled. Use the
+ rcutree.enable_rcu_lazy=0 kernel-boot parameter to override
+ the this option at boot time, thus re-enabling these delays.
config RCU_DOUBLE_CHECK_CB_TIME
bool "RCU callback-batch backup time check"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 9b0b52e1836f..12e4c64ebae1 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -53,6 +53,51 @@ config RCU_TORTURE_TEST
Say M if you want the RCU torture tests to build as a module.
Say N if you are unsure.
+config RCU_TORTURE_TEST_CHK_RDR_STATE
+ bool "Check rcutorture reader state"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to check the desired rcutorture
+ reader state for each segment against the actual context.
+ Note that PREEMPT_COUNT must be enabled if the preempt-disabled
+ and bh-disabled checks are to take effect, and that PREEMPT_RCU
+ must be enabled for the RCU-nesting checks to take effect.
+ These checks add overhead, and this Kconfig options is therefore
+ disabled by default.
+
+ Say Y here if you want rcutorture reader contexts checked.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST_LOG_CPU
+ bool "Log CPU for rcutorture failures"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to decorate each entry of its
+ log of failure/close-call rcutorture reader segments with the
+ number of the CPU that the reader was running on at the time.
+ This information can be useful, but it does incur additional
+ overhead, overhead that can make both failures and close calls
+ less probable.
+
+ Say Y here if you want CPU IDs logged.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST_LOG_GP
+ bool "Log grace-period numbers for rcutorture failures"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to decorate each entry of its
+ log of failure/close-call rcutorture reader segments with the
+ corresponding grace-period sequence numbers. This information
+ can be useful, but it does incur additional overhead, overhead
+ that can make both failures and close calls less probable.
+
+ Say Y here if you want grace-period sequence numbers logged.
+ Say N if you are unsure.
+
config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 86fce206560e..9cf01832a6c3 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -54,12 +54,12 @@
* grace-period sequence number.
*/
-#define RCU_SEQ_CTR_SHIFT 2
-#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
-
/* Low-order bit definition for polled grace-period APIs. */
#define RCU_GET_STATE_COMPLETED 0x1
+/* A complete grace period count */
+#define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1)
+
extern int sysctl_sched_rt_runtime;
/*
@@ -160,12 +160,21 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
* Given a snapshot from rcu_seq_snap(), determine whether or not a
* full update-side operation has occurred, but do not allow the
* (ULONG_MAX / 2) safety-factor/guard-band.
+ *
+ * The token returned by get_state_synchronize_rcu_full() is based on
+ * rcu_state.gp_seq but it is tested in poll_state_synchronize_rcu_full()
+ * against the root rnp->gp_seq. Since rcu_seq_start() is first called
+ * on rcu_state.gp_seq and only later reflected on the root rnp->gp_seq,
+ * it is possible that rcu_seq_snap(rcu_state.gp_seq) returns 2 full grace
+ * periods ahead of the root rnp->gp_seq. To prevent false-positives with the
+ * full polling API that a wrap around instantly completed the GP, when nothing
+ * like that happened, adjust for the 2 GPs in the ULONG_CMP_LT().
*/
static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
{
unsigned long cur_s = READ_ONCE(*sp);
- return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP));
}
/*
@@ -255,6 +264,11 @@ static inline void debug_rcu_head_callback(struct rcu_head *rhp)
kmem_dump_obj(rhp);
}
+static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp)
+{
+ return rhp->next == rhp;
+}
+
extern int rcu_cpu_stall_suppress_at_boot;
static inline bool rcu_stall_is_suppressed_at_boot(void)
@@ -522,12 +536,18 @@ static inline void show_rcu_tasks_gp_kthreads(void) {}
#ifdef CONFIG_TASKS_RCU
struct task_struct *get_rcu_tasks_gp_kthread(void);
+void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq);
#endif // # ifdef CONFIG_TASKS_RCU
#ifdef CONFIG_TASKS_RUDE_RCU
struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
+void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq);
#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+#ifdef CONFIG_TASKS_TRACE_RCU
+void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq);
+#endif
+
#ifdef CONFIG_TASKS_RCU_GENERIC
void tasks_cblist_init_generic(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
@@ -557,17 +577,17 @@ static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { }
#endif
#if defined(CONFIG_TREE_RCU)
-void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gp_seq);
+void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq);
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old,
unsigned long c);
void rcu_gp_set_torture_wait(int duration);
+void rcu_set_gpwrap_lag(unsigned long lag);
+int rcu_get_gpwrap_count(int cpu);
#else
-static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
- int *flags, unsigned long *gp_seq)
+static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
*flags = 0;
*gp_seq = 0;
@@ -583,34 +603,32 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
do { } while (0)
#endif
static inline void rcu_gp_set_torture_wait(int duration) { }
+static inline void rcu_set_gpwrap_lag(unsigned long lag) { }
+static inline int rcu_get_gpwrap_count(int cpu) { return 0; }
#endif
+unsigned long long rcutorture_gather_gp_seqs(void);
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len);
#ifdef CONFIG_TINY_SRCU
-static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
+static inline void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
unsigned long *gp_seq)
{
- if (test_type != SRCU_FLAVOR)
- return;
*flags = 0;
*gp_seq = sp->srcu_idx;
}
#elif defined(CONFIG_TREE_SRCU)
-void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *sp, int *flags,
+void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
unsigned long *gp_seq);
#endif
#ifdef CONFIG_TINY_RCU
-static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; }
+static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; }
static inline unsigned long rcu_get_gp_seq(void) { return 0; }
static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
-static inline unsigned long
-srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
static inline void show_rcu_gp_kthreads(void) { }
@@ -619,10 +637,9 @@ static inline void rcu_fwd_progress_check(unsigned long j) { }
static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
#else /* #ifdef CONFIG_TINY_RCU */
-bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
+bool rcu_watching_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
-unsigned long srcu_batches_completed(struct srcu_struct *sp);
bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
@@ -634,6 +651,12 @@ void rcu_gp_slow_register(atomic_t *rgssp);
void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
+#ifdef CONFIG_TINY_SRCU
+static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+#else // #ifdef CONFIG_TINY_SRCU
+unsigned long srcu_batches_completed(struct srcu_struct *sp);
+#endif // #else // #ifdef CONFIG_TINY_SRCU
+
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_bind_current_to_nocb(void);
#else
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 1693ea22ef1b..298a2c573f02 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -261,17 +261,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Mark the specified rcu_segcblist structure as offloaded (or not)
- */
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
-{
- if (offload)
- rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED);
- else
- rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
-}
-
-/*
* Does the specified rcu_segcblist structure contain callbacks that
* are ready to be invoked?
*/
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 4fe877f5f654..fadc08ad4b7b 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -89,16 +89,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING))
- return true;
-
- return false;
-}
-
-static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
-{
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE))
+ rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED))
return true;
return false;
@@ -129,7 +120,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 8db4fedaaa1e..b521d0455992 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -39,9 +39,11 @@
#include <linux/torture.h>
#include <linux/vmalloc.h>
#include <linux/rcupdate_trace.h>
+#include <linux/sched/debug.h>
#include "rcu.h"
+MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
@@ -103,6 +105,20 @@ static char *scale_type = "rcu";
module_param(scale_type, charp, 0444);
MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)");
+// Structure definitions for custom fixed-per-task allocator.
+struct writer_mblock {
+ struct rcu_head wmb_rh;
+ struct llist_node wmb_node;
+ struct writer_freelist *wmb_wfl;
+};
+
+struct writer_freelist {
+ struct llist_head ws_lhg;
+ atomic_t ws_inflight;
+ struct llist_head ____cacheline_internodealigned_in_smp ws_lhp;
+ struct writer_mblock *ws_mblocks;
+};
+
static int nrealreaders;
static int nrealwriters;
static struct task_struct **writer_tasks;
@@ -110,6 +126,8 @@ static struct task_struct **reader_tasks;
static struct task_struct *shutdown_task;
static u64 **writer_durations;
+static bool *writer_done;
+static struct writer_freelist *writer_freelists;
static int *writer_n_durations;
static atomic_t n_rcu_scale_reader_started;
static atomic_t n_rcu_scale_writer_started;
@@ -119,7 +137,6 @@ static u64 t_rcu_scale_writer_started;
static u64 t_rcu_scale_writer_finished;
static unsigned long b_rcu_gp_test_started;
static unsigned long b_rcu_gp_test_finished;
-static DEFINE_PER_CPU(atomic_t, n_async_inflight);
#define MAX_MEAS 10000
#define MIN_MEAS 100
@@ -142,6 +159,7 @@ struct rcu_scale_ops {
void (*sync)(void);
void (*exp_sync)(void);
struct task_struct *(*rso_gp_kthread)(void);
+ void (*stats)(void);
const char *name;
};
@@ -223,6 +241,11 @@ static void srcu_scale_synchronize(void)
synchronize_srcu(srcu_ctlp);
}
+static void srcu_scale_stats(void)
+{
+ srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG);
+}
+
static void srcu_scale_synchronize_expedited(void)
{
synchronize_srcu_expedited(srcu_ctlp);
@@ -240,6 +263,7 @@ static struct rcu_scale_ops srcu_ops = {
.gp_barrier = srcu_rcu_barrier,
.sync = srcu_scale_synchronize,
.exp_sync = srcu_scale_synchronize_expedited,
+ .stats = srcu_scale_stats,
.name = "srcu"
};
@@ -269,6 +293,7 @@ static struct rcu_scale_ops srcud_ops = {
.gp_barrier = srcu_rcu_barrier,
.sync = srcu_scale_synchronize,
.exp_sync = srcu_scale_synchronize_expedited,
+ .stats = srcu_scale_stats,
.name = "srcud"
};
@@ -287,6 +312,11 @@ static void tasks_scale_read_unlock(int idx)
{
}
+static void rcu_tasks_scale_stats(void)
+{
+ rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
static struct rcu_scale_ops tasks_ops = {
.ptype = RCU_TASKS_FLAVOR,
.init = rcu_sync_scale_init,
@@ -299,6 +329,7 @@ static struct rcu_scale_ops tasks_ops = {
.sync = synchronize_rcu_tasks,
.exp_sync = synchronize_rcu_tasks,
.rso_gp_kthread = get_rcu_tasks_gp_kthread,
+ .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats,
.name = "tasks"
};
@@ -325,6 +356,11 @@ static void tasks_rude_scale_read_unlock(int idx)
{
}
+static void rcu_tasks_rude_scale_stats(void)
+{
+ rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
static struct rcu_scale_ops tasks_rude_ops = {
.ptype = RCU_TASKS_RUDE_FLAVOR,
.init = rcu_sync_scale_init,
@@ -332,11 +368,10 @@ static struct rcu_scale_ops tasks_rude_ops = {
.readunlock = tasks_rude_scale_read_unlock,
.get_gp_seq = rcu_no_completed,
.gp_diff = rcu_seq_diff,
- .async = call_rcu_tasks_rude,
- .gp_barrier = rcu_barrier_tasks_rude,
.sync = synchronize_rcu_tasks_rude,
.exp_sync = synchronize_rcu_tasks_rude,
.rso_gp_kthread = get_rcu_tasks_rude_gp_kthread,
+ .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats,
.name = "tasks-rude"
};
@@ -365,6 +400,11 @@ static void tasks_trace_scale_read_unlock(int idx)
rcu_read_unlock_trace();
}
+static void rcu_tasks_trace_scale_stats(void)
+{
+ rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG);
+}
+
static struct rcu_scale_ops tasks_tracing_ops = {
.ptype = RCU_TASKS_FLAVOR,
.init = rcu_sync_scale_init,
@@ -377,6 +417,7 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
.rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
+ .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats,
.name = "tasks-tracing"
};
@@ -437,12 +478,52 @@ rcu_scale_reader(void *arg)
}
/*
+ * Allocate a writer_mblock structure for the specified rcu_scale_writer
+ * task.
+ */
+static struct writer_mblock *rcu_scale_alloc(long me)
+{
+ struct llist_node *llnp;
+ struct writer_freelist *wflp;
+ struct writer_mblock *wmbp;
+
+ if (WARN_ON_ONCE(!writer_freelists))
+ return NULL;
+ wflp = &writer_freelists[me];
+ if (llist_empty(&wflp->ws_lhp)) {
+ // ->ws_lhp is private to its rcu_scale_writer task.
+ wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node);
+ wflp->ws_lhp.first = &wmbp->wmb_node;
+ }
+ llnp = llist_del_first(&wflp->ws_lhp);
+ if (!llnp)
+ return NULL;
+ return container_of(llnp, struct writer_mblock, wmb_node);
+}
+
+/*
+ * Free a writer_mblock structure to its rcu_scale_writer task.
+ */
+static void rcu_scale_free(struct writer_mblock *wmbp)
+{
+ struct writer_freelist *wflp;
+
+ if (!wmbp)
+ return;
+ wflp = wmbp->wmb_wfl;
+ llist_add(&wmbp->wmb_node, &wflp->ws_lhg);
+}
+
+/*
* Callback function for asynchronous grace periods from rcu_scale_writer().
*/
static void rcu_scale_async_cb(struct rcu_head *rhp)
{
- atomic_dec(this_cpu_ptr(&n_async_inflight));
- kfree(rhp);
+ struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh);
+ struct writer_freelist *wflp = wmbp->wmb_wfl;
+
+ atomic_dec(&wflp->ws_inflight);
+ rcu_scale_free(wmbp);
}
/*
@@ -455,12 +536,14 @@ rcu_scale_writer(void *arg)
int i_max;
unsigned long jdone;
long me = (long)arg;
- struct rcu_head *rhp = NULL;
+ bool selfreport = false;
bool started = false, done = false, alldone = false;
u64 t;
DEFINE_TORTURE_RANDOM(tr);
u64 *wdp;
u64 *wdpp = writer_durations[me];
+ struct writer_freelist *wflp = &writer_freelists[me];
+ struct writer_mblock *wmbp = NULL;
VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
WARN_ON(!wdpp);
@@ -492,30 +575,34 @@ rcu_scale_writer(void *arg)
jdone = jiffies + minruntime * HZ;
do {
+ bool gp_succeeded = false;
+
if (writer_holdoff)
udelay(writer_holdoff);
if (writer_holdoff_jiffies)
schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
wdp = &wdpp[i];
*wdp = ktime_get_mono_fast_ns();
- if (gp_async) {
-retry:
- if (!rhp)
- rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
- if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
- atomic_inc(this_cpu_ptr(&n_async_inflight));
- cur_ops->async(rhp, rcu_scale_async_cb);
- rhp = NULL;
+ if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) {
+ if (!wmbp)
+ wmbp = rcu_scale_alloc(me);
+ if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) {
+ atomic_inc(&wflp->ws_inflight);
+ cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb);
+ wmbp = NULL;
+ gp_succeeded = true;
} else if (!kthread_should_stop()) {
cur_ops->gp_barrier();
- goto retry;
} else {
- kfree(rhp); /* Because we are stopping. */
+ rcu_scale_free(wmbp); /* Because we are stopping. */
+ wmbp = NULL;
}
} else if (gp_exp) {
cur_ops->exp_sync();
+ gp_succeeded = true;
} else {
cur_ops->sync();
+ gp_succeeded = true;
}
t = ktime_get_mono_fast_ns();
*wdp = t - *wdp;
@@ -525,6 +612,7 @@ retry:
started = true;
if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
done = true;
+ WRITE_ONCE(writer_done[me], true);
sched_set_normal(current, 0);
pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
scale_type, SCALE_FLAG, me, MIN_MEAS);
@@ -550,11 +638,32 @@ retry:
if (done && !alldone &&
atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters)
alldone = true;
- if (started && !alldone && i < MAX_MEAS - 1)
+ if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) {
+ static atomic_t dumped;
+ int i;
+
+ if (!atomic_xchg(&dumped, 1)) {
+ for (i = 0; i < nrealwriters; i++) {
+ if (writer_done[i])
+ continue;
+ pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i);
+ sched_show_task(writer_tasks[i]);
+ }
+ if (cur_ops->stats)
+ cur_ops->stats();
+ }
+ }
+ if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) {
+ pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n",
+ __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone);
+ selfreport = true;
+ }
+ if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1)
i++;
rcu_scale_wait_shutdown();
} while (!torture_must_stop());
- if (gp_async) {
+ if (gp_async && cur_ops->async) {
+ rcu_scale_free(wmbp);
cur_ops->gp_barrier();
}
writer_n_durations[me] = i_max + 1;
@@ -653,7 +762,7 @@ kfree_scale_thread(void *arg)
}
for (i = 0; i < kfree_alloc_num; i++) {
- alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL);
+ alloc_ptr = kcalloc(kfree_mult, sizeof(struct kfree_obj), GFP_KERNEL);
if (!alloc_ptr)
return -ENOMEM;
@@ -712,6 +821,7 @@ kfree_scale_cleanup(void)
torture_stop_kthread(kfree_scale_thread,
kfree_reader_tasks[i]);
kfree(kfree_reader_tasks);
+ kfree_reader_tasks = NULL;
}
torture_cleanup_end();
@@ -779,14 +889,14 @@ kfree_scale_init(void)
if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
- WARN_ON_ONCE(1);
- return -1;
+ firsterr = -1;
+ goto unwind;
}
if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) {
pr_alert("ERROR: call_rcu() CBs are being too lazy!\n");
- WARN_ON_ONCE(1);
- return -1;
+ firsterr = -1;
+ goto unwind;
}
}
@@ -880,6 +990,7 @@ rcu_scale_cleanup(void)
torture_stop_kthread(rcu_scale_reader,
reader_tasks[i]);
kfree(reader_tasks);
+ reader_tasks = NULL;
}
if (writer_tasks) {
@@ -918,10 +1029,33 @@ rcu_scale_cleanup(void)
schedule_timeout_uninterruptible(1);
}
kfree(writer_durations[i]);
+ if (writer_freelists) {
+ int ctr = 0;
+ struct llist_node *llnp;
+ struct writer_freelist *wflp = &writer_freelists[i];
+
+ if (wflp->ws_mblocks) {
+ llist_for_each(llnp, wflp->ws_lhg.first)
+ ctr++;
+ llist_for_each(llnp, wflp->ws_lhp.first)
+ ctr++;
+ WARN_ONCE(ctr != gp_async_max,
+ "%s: ctr = %d gp_async_max = %d\n",
+ __func__, ctr, gp_async_max);
+ kfree(wflp->ws_mblocks);
+ }
+ }
}
kfree(writer_tasks);
+ writer_tasks = NULL;
kfree(writer_durations);
+ writer_durations = NULL;
kfree(writer_n_durations);
+ writer_n_durations = NULL;
+ kfree(writer_done);
+ writer_done = NULL;
+ kfree(writer_freelists);
+ writer_freelists = NULL;
}
/* Do torture-type-specific cleanup operations. */
@@ -948,8 +1082,9 @@ rcu_scale_shutdown(void *arg)
static int __init
rcu_scale_init(void)
{
- long i;
int firsterr = 0;
+ long i;
+ long j;
static struct rcu_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
};
@@ -1016,14 +1151,22 @@ rcu_scale_init(void)
}
while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
schedule_timeout_uninterruptible(1);
- writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]),
- GFP_KERNEL);
- writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations),
- GFP_KERNEL);
- writer_n_durations =
- kcalloc(nrealwriters, sizeof(*writer_n_durations),
- GFP_KERNEL);
- if (!writer_tasks || !writer_durations || !writer_n_durations) {
+ writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL);
+ writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL);
+ writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL);
+ writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL);
+ if (gp_async) {
+ if (gp_async_max <= 0) {
+ pr_warn("%s: gp_async_max = %d must be greater than zero.\n",
+ __func__, gp_async_max);
+ WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST));
+ firsterr = -EINVAL;
+ goto unwind;
+ }
+ writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL);
+ }
+ if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done ||
+ (gp_async && !writer_freelists)) {
SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
@@ -1036,6 +1179,24 @@ rcu_scale_init(void)
firsterr = -ENOMEM;
goto unwind;
}
+ if (writer_freelists) {
+ struct writer_freelist *wflp = &writer_freelists[i];
+
+ init_llist_head(&wflp->ws_lhg);
+ init_llist_head(&wflp->ws_lhp);
+ wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]),
+ GFP_KERNEL);
+ if (!wflp->ws_mblocks) {
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (j = 0; j < gp_async_max; j++) {
+ struct writer_mblock *wmbp = &wflp->ws_mblocks[j];
+
+ wmbp->wmb_wfl = wflp;
+ llist_add(&wmbp->wmb_node, &wflp->ws_lhp);
+ }
+ }
firsterr = torture_create_kthread(rcu_scale_writer, (void *)i,
writer_tasks[i]);
if (torture_init_error(firsterr))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 45d6b4c3d199..70ec0f21abc3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -51,14 +51,15 @@
#include "rcu.h"
+MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
/* Bits for ->extendables field, extendables param, and related definitions. */
#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1)
-#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2)
+#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2)
#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
@@ -70,6 +71,9 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
+#define RCUTORTURE_RDR_ALLBITS \
+ (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \
+ RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2)
#define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */
/* Must be power of two minus one. */
#define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
@@ -88,12 +92,20 @@ torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait
torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
torture_param(bool, gp_cond_exp_full, false,
"Use conditional/async full-stateexpedited GP wait primitives");
+torture_param(int, gp_cond_wi, 16 * USEC_PER_SEC / HZ,
+ "Wait interval for normal conditional grace periods, us (default 16 jiffies)");
+torture_param(int, gp_cond_wi_exp, 128,
+ "Wait interval for expedited conditional grace periods, us (default 128 us)");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
+torture_param(int, gp_poll_wi, 16 * USEC_PER_SEC / HZ,
+ "Wait interval for normal polled grace periods, us (default 16 jiffies)");
+torture_param(int, gp_poll_wi_exp, 128,
+ "Wait interval for expedited polled grace periods, us (default 128 us)");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
@@ -103,10 +115,17 @@ torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(bool, gpwrap_lag, true, "Enable grace-period wrap lag testing");
+torture_param(int, gpwrap_lag_gps, 8, "Value to set for set_gpwrap_lag during an active testing period.");
+torture_param(int, gpwrap_lag_cycle_mins, 30, "Total cycle duration for gpwrap lag testing (in minutes)");
+torture_param(int, gpwrap_lag_active_mins, 5, "Duration for which gpwrap lag is active within each cycle (in minutes)");
torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
+torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable");
+torture_param(int, preempt_interval, MSEC_PER_SEC, "Interval between preemptions (ms)");
torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
+torture_param(int, reader_flavor, SRCU_READ_FLAVOR_NORMAL, "Reader flavors to use, one per bit.");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
@@ -114,11 +133,13 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s
torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
+torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one.");
torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds.");
torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
@@ -131,6 +152,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
static int nrealnocbers;
static int nrealreaders;
+static int nrealfakewriters;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
@@ -143,6 +165,7 @@ static struct task_struct **fwd_prog_tasks;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
static struct task_struct *read_exit_task;
+static struct task_struct *preempt_task;
#define RCU_TORTURE_PIPE_LEN 10
@@ -253,10 +276,16 @@ struct rt_read_seg {
unsigned long rt_delay_ms;
unsigned long rt_delay_us;
bool rt_preempted;
+ int rt_cpu;
+ int rt_end_cpu;
+ unsigned long long rt_gp_seq;
+ unsigned long long rt_gp_seq_end;
+ u64 rt_ts;
};
static int err_segs_recorded;
static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS];
static int rt_read_nsegs;
+static int rt_read_preempted;
static const char *rcu_torture_writer_state_getname(void)
{
@@ -347,7 +376,8 @@ struct rcu_torture_ops {
void (*read_delay)(struct torture_random_state *rrsp,
struct rt_read_seg *rtrsp);
void (*readunlock)(int idx);
- int (*readlock_held)(void);
+ int (*readlock_held)(void); // lockdep.
+ int (*readlock_nesting)(void); // actual nesting, if available, -1 if not.
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
@@ -365,8 +395,6 @@ struct rcu_torture_ops {
bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
unsigned long (*get_gp_state)(void);
void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
- unsigned long (*get_gp_completed)(void);
- void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
unsigned long (*start_gp_poll)(void);
void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
bool (*poll_gp_state)(unsigned long oldstate);
@@ -374,6 +402,8 @@ struct rcu_torture_ops {
bool (*poll_need_2gp)(bool poll, bool poll_full);
void (*cond_sync)(unsigned long oldstate);
void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
+ int poll_active;
+ int poll_active_full;
call_rcu_func_t call;
void (*cb_barrier)(void);
void (*fqs)(void);
@@ -381,12 +411,22 @@ struct rcu_torture_ops {
void (*gp_kthread_dbg)(void);
bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
+ void (*get_gp_data)(int *flags, unsigned long *gp_seq);
+ void (*gp_slow_register)(atomic_t *rgssp);
+ void (*gp_slow_unregister)(atomic_t *rgssp);
+ bool (*reader_blocked)(void);
+ unsigned long long (*gather_gp_seqs)(void);
+ void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len);
+ void (*set_gpwrap_lag)(unsigned long lag);
+ int (*get_gpwrap_count)(int cpu);
long cbflood_max;
int irq_capable;
int can_boost;
int extendables;
int slow_gps;
int no_pi_lock;
+ int debug_objects;
+ int start_poll_irqsoff;
const char *name;
};
@@ -437,10 +477,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
rtrsp->rt_delay_us = shortdelay_us;
}
if (!preempt_count() &&
- !(torture_random(rrsp) % (nrealreaders * 500))) {
+ !(torture_random(rrsp) % (nrealreaders * 500)))
torture_preempt_schedule(); /* QS only if preemptible. */
- rtrsp->rt_preempted = true;
- }
}
static void rcu_torture_read_unlock(int idx)
@@ -448,6 +486,15 @@ static void rcu_torture_read_unlock(int idx)
rcu_read_unlock();
}
+static int rcu_torture_readlock_nesting(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU))
+ return rcu_preempt_depth();
+ if (IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return (preempt_count() & PREEMPT_MASK);
+ return -1;
+}
+
/*
* Update callback in the pipe. This should be invoked after a grace period.
*/
@@ -461,12 +508,13 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp)
WRITE_ONCE(rp->rtort_chkp, NULL);
smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire().
}
- i = READ_ONCE(rp->rtort_pipe_count);
+ i = rp->rtort_pipe_count;
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(rp->rtort_pipe_count, i + 1);
- if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count);
+ if (i + 1 >= RCU_TORTURE_PIPE_LEN) {
rp->rtort_mbtest = 0;
return true;
}
@@ -536,6 +584,7 @@ static struct rcu_torture_ops rcu_ops = {
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
.readlock_held = torture_readlock_not_held,
+ .readlock_nesting = rcu_torture_readlock_nesting,
.get_gp_seq = rcu_get_gp_seq,
.gp_diff = rcu_seq_diff,
.deferred_free = rcu_torture_deferred_free,
@@ -547,8 +596,6 @@ static struct rcu_torture_ops rcu_ops = {
.get_comp_state_full = get_completed_synchronize_rcu_full,
.get_gp_state = get_state_synchronize_rcu,
.get_gp_state_full = get_state_synchronize_rcu_full,
- .get_gp_completed = get_completed_synchronize_rcu,
- .get_gp_completed_full = get_completed_synchronize_rcu_full,
.start_gp_poll = start_poll_synchronize_rcu,
.start_gp_poll_full = start_poll_synchronize_rcu_full,
.poll_gp_state = poll_state_synchronize_rcu,
@@ -556,21 +603,35 @@ static struct rcu_torture_ops rcu_ops = {
.poll_need_2gp = rcu_poll_need_2gp,
.cond_sync = cond_synchronize_rcu,
.cond_sync_full = cond_synchronize_rcu_full,
+ .poll_active = NUM_ACTIVE_RCU_POLL_OLDSTATE,
+ .poll_active_full = NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE,
.get_gp_state_exp = get_state_synchronize_rcu,
.start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
.start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full,
.poll_gp_state_exp = poll_state_synchronize_rcu,
.cond_sync_exp = cond_synchronize_rcu_expedited,
+ .cond_sync_exp_full = cond_synchronize_rcu_expedited_full,
.call = call_rcu_hurry,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
- .stats = NULL,
.gp_kthread_dbg = show_rcu_gp_kthreads,
.check_boost_failed = rcu_check_boost_fail,
.stall_dur = rcu_jiffies_till_stall_check,
+ .get_gp_data = rcutorture_get_gp_data,
+ .gp_slow_register = rcu_gp_slow_register,
+ .gp_slow_unregister = rcu_gp_slow_unregister,
+ .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)
+ ? has_rcu_reader_blocked
+ : NULL,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
+ .set_gpwrap_lag = rcu_set_gpwrap_lag,
+ .get_gpwrap_count = rcu_get_gpwrap_count,
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
+ .debug_objects = 1,
+ .start_poll_irqsoff = 1,
.name = "rcu"
};
@@ -611,10 +672,10 @@ static struct rcu_torture_ops rcu_busted_ops = {
.sync = synchronize_rcu_busted,
.exp_sync = synchronize_rcu_busted,
.call = call_rcu_busted,
- .cb_barrier = NULL,
- .fqs = NULL,
- .stats = NULL,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
.irq_capable = 1,
+ .extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted"
};
@@ -627,12 +688,41 @@ static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
static struct rcu_torture_ops srcud_ops;
+static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq);
+}
+
static int srcu_torture_read_lock(void)
{
- if (cur_ops == &srcud_ops)
- return srcu_read_lock_nmisafe(srcu_ctlp);
- else
- return srcu_read_lock(srcu_ctlp);
+ int idx;
+ struct srcu_ctr __percpu *scp;
+ int ret = 0;
+
+ WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL);
+
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
+ idx = srcu_read_lock(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_NMI) {
+ idx = srcu_read_lock_nmisafe(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 1;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_LITE) {
+ idx = srcu_read_lock_lite(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 2;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 3;
+ }
+ return ret;
}
static void
@@ -656,10 +746,15 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
static void srcu_torture_read_unlock(int idx)
{
- if (cur_ops == &srcud_ops)
- srcu_read_unlock_nmisafe(srcu_ctlp, idx);
- else
- srcu_read_unlock(srcu_ctlp, idx);
+ WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+ srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+ if (reader_flavor & SRCU_READ_FLAVOR_LITE)
+ srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2);
+ if (reader_flavor & SRCU_READ_FLAVOR_NMI)
+ srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
+ srcu_read_unlock(srcu_ctlp, idx & 0x1);
}
static int torture_srcu_read_lock_held(void)
@@ -726,18 +821,24 @@ static struct rcu_torture_ops srcu_ops = {
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .same_gp_state = same_state_synchronize_srcu,
+ .get_comp_state = get_completed_synchronize_srcu,
.get_gp_state = srcu_torture_get_gp_state,
.start_gp_poll = srcu_torture_start_gp_poll,
.poll_gp_state = srcu_torture_poll_gp_state,
+ .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .get_gp_data = srcu_get_gp_data,
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
.name = "srcu"
};
@@ -764,18 +865,24 @@ static struct rcu_torture_ops srcud_ops = {
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .same_gp_state = same_state_synchronize_srcu,
+ .get_comp_state = get_completed_synchronize_srcu,
.get_gp_state = srcu_torture_get_gp_state,
.start_gp_poll = srcu_torture_start_gp_poll,
.poll_gp_state = srcu_torture_poll_gp_state,
+ .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .get_gp_data = srcu_get_gp_data,
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
.name = "srcud"
};
@@ -811,7 +918,7 @@ static void synchronize_rcu_trivial(void)
int cpu;
for_each_online_cpu(cpu) {
- torture_sched_setaffinity(current->pid, cpumask_of(cpu));
+ torture_sched_setaffinity(current->pid, cpumask_of(cpu), true);
WARN_ON_ONCE(raw_smp_processor_id() != cpu);
}
}
@@ -837,8 +944,6 @@ static struct rcu_torture_ops trivial_ops = {
.get_gp_seq = rcu_no_completed,
.sync = synchronize_rcu_trivial,
.exp_sync = synchronize_rcu_trivial,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.name = "trivial"
};
@@ -881,8 +986,7 @@ static struct rcu_torture_ops tasks_ops = {
.call = call_rcu_tasks,
.cb_barrier = rcu_barrier_tasks,
.gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread,
- .fqs = NULL,
- .stats = NULL,
+ .get_gp_data = rcu_tasks_get_gp_data,
.irq_capable = 1,
.slow_gps = 1,
.name = "tasks"
@@ -903,11 +1007,6 @@ static struct rcu_torture_ops tasks_ops = {
* Definitions for rude RCU-tasks torture testing.
*/
-static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p)
-{
- call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb);
-}
-
static struct rcu_torture_ops tasks_rude_ops = {
.ttype = RCU_TASKS_RUDE_FLAVOR,
.init = rcu_sync_torture_init,
@@ -915,15 +1014,11 @@ static struct rcu_torture_ops tasks_rude_ops = {
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock_trivial,
.get_gp_seq = rcu_no_completed,
- .deferred_free = rcu_tasks_rude_torture_deferred_free,
.sync = synchronize_rcu_tasks_rude,
.exp_sync = synchronize_rcu_tasks_rude,
- .call = call_rcu_tasks_rude,
- .cb_barrier = rcu_barrier_tasks_rude,
.gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .get_gp_data = rcu_tasks_rude_get_gp_data,
.cbflood_max = 50000,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.name = "tasks-rude"
};
@@ -973,9 +1068,8 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
.gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .get_gp_data = rcu_tasks_trace_get_gp_data,
.cbflood_max = 50000,
- .fqs = NULL,
- .stats = NULL,
.irq_capable = 1,
.slow_gps = 1,
.name = "tasks-tracing"
@@ -1050,8 +1144,13 @@ static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *star
// At most one persisted message per boost test.
j = jiffies;
lp = READ_ONCE(last_persist);
- if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
- pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ if (time_after(j, lp + mininterval) &&
+ cmpxchg(&last_persist, lp, j) == lp) {
+ if (cpu < 0)
+ pr_info("Boost inversion persisted: QS from all CPUs\n");
+ else
+ pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ }
return false; // passed on a technicality
}
VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
@@ -1081,8 +1180,19 @@ static int rcu_torture_boost(void *arg)
unsigned long gp_state;
unsigned long gp_state_time;
unsigned long oldstarttime;
+ unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ;
- VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ } else {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period");
+ while (time_before(jiffies, booststarttime)) {
+ schedule_timeout_idle(HZ);
+ if (kthread_should_stop())
+ goto cleanup;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period");
+ }
/* Set real-time priority. */
sched_set_fifo_low(current);
@@ -1158,6 +1268,7 @@ checkwait: if (stutter_wait("rcu_torture_boost"))
sched_set_fifo_low(current);
} while (!torture_must_stop());
+cleanup:
/* Clean up and exit. */
while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
@@ -1308,6 +1419,8 @@ static void rcu_torture_write_types(void)
} else if (gp_sync && !cur_ops->sync) {
pr_alert("%s: gp_sync without primitives.\n", __func__);
}
+ pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes);
+ pr_info("%s: gp_cond_wi %d gp_cond_wi_exp %d gp_poll_wi %d gp_poll_wi_exp %d\n", __func__, gp_cond_wi, gp_cond_wi_exp, gp_poll_wi, gp_poll_wi_exp);
}
/*
@@ -1364,17 +1477,20 @@ rcu_torture_writer(void *arg)
int i;
int idx;
int oldnice = task_nice(current);
- struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE];
+ struct rcu_gp_oldstate *rgo = NULL;
+ int rgo_size = 0;
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
unsigned long stallsdone = jiffies;
bool stutter_waited;
- unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+ unsigned long *ulo = NULL;
+ int ulo_size = 0;
// If a new stall test is added, this must be adjusted.
if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu)
- stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ;
+ stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) *
+ HZ * (stall_cpu_repeat + 1);
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite)
pr_alert("%s" TORTURE_FLAG
@@ -1391,6 +1507,16 @@ rcu_torture_writer(void *arg)
torture_kthread_stopping("rcu_torture_writer");
return 0;
}
+ if (cur_ops->poll_active > 0) {
+ ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL);
+ if (!WARN_ON(!ulo))
+ ulo_size = cur_ops->poll_active;
+ }
+ if (cur_ops->poll_active_full > 0) {
+ rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL);
+ if (!WARN_ON(!rgo))
+ rgo_size = cur_ops->poll_active_full;
+ }
do {
rcu_torture_writer_state = RTWS_FIXED_DELAY;
@@ -1399,6 +1525,7 @@ rcu_torture_writer(void *arg)
if (rp == NULL)
continue;
rp->rtort_pipe_count = 0;
+ ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count);
rcu_torture_writer_state = RTWS_DELAY;
udelay(torture_random(&rand) & 0x3ff);
rcu_torture_writer_state = RTWS_REPLACE;
@@ -1414,6 +1541,7 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+ ASSERT_EXCLUSIVE_WRITER(old_rp->rtort_pipe_count);
// Make sure readers block polled grace periods.
if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
@@ -1425,8 +1553,8 @@ rcu_torture_writer(void *arg)
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
cookie, cur_ops->get_gp_state());
- if (cur_ops->get_gp_completed) {
- cookie = cur_ops->get_gp_completed();
+ if (cur_ops->get_comp_state) {
+ cookie = cur_ops->get_comp_state();
WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
}
cur_ops->readunlock(idx);
@@ -1440,8 +1568,8 @@ rcu_torture_writer(void *arg)
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
cpumask_pr_args(cpu_online_mask));
- if (cur_ops->get_gp_completed_full) {
- cur_ops->get_gp_completed_full(&cookie_full);
+ if (cur_ops->get_comp_state_full) {
+ cur_ops->get_comp_state_full(&cookie_full);
WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
}
cur_ops->readunlock(idx);
@@ -1459,7 +1587,8 @@ rcu_torture_writer(void *arg)
case RTWS_COND_GET:
rcu_torture_writer_state = RTWS_COND_GET;
gp_snap = cur_ops->get_gp_state();
- torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi,
+ 1000, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC;
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
@@ -1467,7 +1596,8 @@ rcu_torture_writer(void *arg)
case RTWS_COND_GET_EXP:
rcu_torture_writer_state = RTWS_COND_GET_EXP;
gp_snap = cur_ops->get_gp_state_exp();
- torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp,
+ 1000, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC_EXP;
cur_ops->cond_sync_exp(gp_snap);
rcu_torture_pipe_update(old_rp);
@@ -1475,7 +1605,8 @@ rcu_torture_writer(void *arg)
case RTWS_COND_GET_FULL:
rcu_torture_writer_state = RTWS_COND_GET_FULL;
cur_ops->get_gp_state_full(&gp_snap_full);
- torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi,
+ 1000, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC_FULL;
cur_ops->cond_sync_full(&gp_snap_full);
rcu_torture_pipe_update(old_rp);
@@ -1483,49 +1614,50 @@ rcu_torture_writer(void *arg)
case RTWS_COND_GET_EXP_FULL:
rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL;
cur_ops->get_gp_state_full(&gp_snap_full);
- torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp,
+ 1000, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL;
cur_ops->cond_sync_exp_full(&gp_snap_full);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_POLL_GET:
rcu_torture_writer_state = RTWS_POLL_GET;
- for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ for (i = 0; i < ulo_size; i++)
ulo[i] = cur_ops->get_comp_state();
gp_snap = cur_ops->start_gp_poll();
rcu_torture_writer_state = RTWS_POLL_WAIT;
while (!cur_ops->poll_gp_state(gp_snap)) {
gp_snap1 = cur_ops->get_gp_state();
- for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ for (i = 0; i < ulo_size; i++)
if (cur_ops->poll_gp_state(ulo[i]) ||
cur_ops->same_gp_state(ulo[i], gp_snap1)) {
ulo[i] = gp_snap1;
break;
}
- WARN_ON_ONCE(i >= ARRAY_SIZE(ulo));
- torture_hrtimeout_jiffies(torture_random(&rand) % 16,
- &rand);
+ WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi,
+ 1000, &rand);
}
rcu_torture_pipe_update(old_rp);
break;
case RTWS_POLL_GET_FULL:
rcu_torture_writer_state = RTWS_POLL_GET_FULL;
- for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ for (i = 0; i < rgo_size; i++)
cur_ops->get_comp_state_full(&rgo[i]);
cur_ops->start_gp_poll_full(&gp_snap_full);
rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
cur_ops->get_gp_state_full(&gp_snap1_full);
- for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ for (i = 0; i < rgo_size; i++)
if (cur_ops->poll_gp_state_full(&rgo[i]) ||
cur_ops->same_gp_state_full(&rgo[i],
&gp_snap1_full)) {
rgo[i] = gp_snap1_full;
break;
}
- WARN_ON_ONCE(i >= ARRAY_SIZE(rgo));
- torture_hrtimeout_jiffies(torture_random(&rand) % 16,
- &rand);
+ WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi,
+ 1000, &rand);
}
rcu_torture_pipe_update(old_rp);
break;
@@ -1534,8 +1666,8 @@ rcu_torture_writer(void *arg)
gp_snap = cur_ops->start_gp_poll_exp();
rcu_torture_writer_state = RTWS_POLL_WAIT_EXP;
while (!cur_ops->poll_gp_state_exp(gp_snap))
- torture_hrtimeout_jiffies(torture_random(&rand) % 16,
- &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp,
+ 1000, &rand);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_POLL_GET_EXP_FULL:
@@ -1543,8 +1675,8 @@ rcu_torture_writer(void *arg)
cur_ops->start_gp_poll_exp_full(&gp_snap_full);
rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL;
while (!cur_ops->poll_gp_state_full(&gp_snap_full))
- torture_hrtimeout_jiffies(torture_random(&rand) % 16,
- &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp,
+ 1000, &rand);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_SYNC:
@@ -1586,7 +1718,8 @@ rcu_torture_writer(void *arg)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) {
tracing_off();
- show_rcu_gp_kthreads();
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1604,6 +1737,8 @@ rcu_torture_writer(void *arg)
pr_alert("%s" TORTURE_FLAG
" Dynamic grace-period expediting was disabled.\n",
torture_type);
+ kfree(ulo);
+ kfree(rgo);
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
return 0;
@@ -1637,7 +1772,7 @@ rcu_torture_fakewriter(void *arg)
do {
torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
- torture_random(&rand) % (nfakewriters * 8) == 0) {
+ torture_random(&rand) % (nrealfakewriters * 8) == 0) {
cur_ops->cb_barrier();
} else {
switch (synctype[torture_random(&rand) % nsynctypes]) {
@@ -1667,14 +1802,22 @@ rcu_torture_fakewriter(void *arg)
cur_ops->cond_sync_exp_full(&gp_snap_full);
break;
case RTWS_POLL_GET:
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_disable();
gp_snap = cur_ops->start_gp_poll();
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_enable();
while (!cur_ops->poll_gp_state(gp_snap)) {
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
}
break;
case RTWS_POLL_GET_FULL:
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_disable();
cur_ops->start_gp_poll_full(&gp_snap_full);
+ if (cur_ops->start_poll_irqsoff)
+ local_irq_enable();
while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
torture_hrtimeout_jiffies(torture_random(&rand) % 16,
&rand);
@@ -1770,6 +1913,62 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign.
}
+// Verify the specified RCUTORTURE_RDR* state.
+#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count()
+static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq)
+{
+ int mask;
+
+ if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE))
+ return;
+
+ WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS);
+ WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS);
+
+ // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable.
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
+ !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
+ WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
+ !(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
+ WARN_ONCE(cur_ops->readlock_nesting &&
+ (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
+ cur_ops->readlock_nesting() == 0, ROEC_ARGS);
+
+ // Timer handlers have all sorts of stuff disabled, so ignore
+ // unintended disabling.
+ if (insoftirq)
+ return;
+
+ WARN_ONCE(cur_ops->extendables &&
+ !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
+ (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses preempt_disable()
+ * as rcu_read_lock().
+ */
+ mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+
+ WARN_ONCE(cur_ops->extendables && !(curstate & mask) &&
+ (preempt_count() & PREEMPT_MASK), ROEC_ARGS);
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses "preempt_count() &
+ * PREEMPT_MASK" as ->readlock_nesting().
+ */
+ mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+
+ WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) &&
+ cur_ops->readlock_nesting() > 0, ROEC_ARGS);
+}
+
/*
* Do one extension of an RCU read-side critical section using the
* current reader state in readstate (set to zero for initial entry
@@ -1779,10 +1978,11 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
* beginning or end of the critical section and if there was actually a
* change, do a ->read_delay().
*/
-static void rcutorture_one_extend(int *readstate, int newstate,
+static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
struct torture_random_state *trsp,
struct rt_read_seg *rtrsp)
{
+ bool first;
unsigned long flags;
int idxnew1 = -1;
int idxnew2 = -1;
@@ -1791,8 +1991,10 @@ static void rcutorture_one_extend(int *readstate, int newstate,
int statesnew = ~*readstate & newstate;
int statesold = *readstate & ~newstate;
+ first = idxold1 == 0;
WARN_ON_ONCE(idxold2 < 0);
- WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
+ WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS);
+ rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
@@ -1807,9 +2009,31 @@ static void rcutorture_one_extend(int *readstate, int newstate,
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
if (statesnew & RCUTORTURE_RDR_RCU_1)
- idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1;
+ idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1;
if (statesnew & RCUTORTURE_RDR_RCU_2)
- idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
+ idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2;
+
+ // Complain unless both the old and the new protection is in place.
+ rcutorture_one_extend_check("during change",
+ idxold1 | statesnew, statesnew, statesold, insoftirq);
+
+ // Sample CPU under both sets of protections to reduce confusion.
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) {
+ int cpu = raw_smp_processor_id();
+ rtrsp->rt_cpu = cpu;
+ if (!first) {
+ rtrsp[-1].rt_end_cpu = cpu;
+ if (cur_ops->reader_blocked)
+ rtrsp[-1].rt_preempted = cur_ops->reader_blocked();
+ }
+ }
+ // Sample grace-period sequence number, as good a place as any.
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) {
+ rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs();
+ rtrsp->rt_ts = ktime_get_mono_fast_ns();
+ if (!first)
+ rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq;
+ }
/*
* Next, remove old protection, in decreasing order of strength
@@ -1829,7 +2053,7 @@ static void rcutorture_one_extend(int *readstate, int newstate,
if (statesold & RCUTORTURE_RDR_RBH)
rcu_read_unlock_bh();
if (statesold & RCUTORTURE_RDR_RCU_2) {
- cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1);
+ cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2);
WARN_ON_ONCE(idxnew2 != -1);
idxold2 = 0;
}
@@ -1839,7 +2063,7 @@ static void rcutorture_one_extend(int *readstate, int newstate,
lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
if (lockit)
raw_spin_lock_irqsave(&current->pi_lock, flags);
- cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1);
+ cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1);
WARN_ON_ONCE(idxnew1 != -1);
idxold1 = 0;
if (lockit)
@@ -1854,16 +2078,14 @@ static void rcutorture_one_extend(int *readstate, int newstate,
if (idxnew1 == -1)
idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
WARN_ON_ONCE(idxnew1 < 0);
- if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1))
- pr_info("Unexpected idxnew1 value of %#x\n", idxnew1);
if (idxnew2 == -1)
idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
WARN_ON_ONCE(idxnew2 < 0);
- WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
*readstate = idxnew1 | idxnew2 | newstate;
WARN_ON_ONCE(*readstate < 0);
- if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1))
- pr_info("Unexpected idxnew2 value of %#x\n", idxnew2);
+ if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS))
+ pr_info("Unexpected readstate value of %#x\n", *readstate);
+ rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -1888,7 +2110,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1);
+ WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); // Can't have reader idx bits.
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
@@ -1930,7 +2152,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
* critical section.
*/
static struct rt_read_seg *
-rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
+rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp,
struct rt_read_seg *rtrsp)
{
int i;
@@ -1945,105 +2167,148 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
- rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]);
+ rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]);
}
return &rtrsp[j];
}
-/*
- * Do one read-side critical section, returning false if there was
- * no data to read. Can be invoked both from process context and
- * from a timer handler.
- */
-static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
-{
- bool checkpolling = !(torture_random(trsp) & 0xfff);
+struct rcu_torture_one_read_state {
+ bool checkpolling;
unsigned long cookie;
struct rcu_gp_oldstate cookie_full;
- int i;
unsigned long started;
- unsigned long completed;
- int newstate;
struct rcu_torture *p;
- int pipe_count;
- int readstate = 0;
- struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } };
- struct rt_read_seg *rtrsp = &rtseg[0];
- struct rt_read_seg *rtrsp1;
+ int readstate;
+ struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS];
+ struct rt_read_seg *rtrsp;
unsigned long long ts;
+};
- WARN_ON_ONCE(!rcu_is_watching());
- newstate = rcutorture_extend_mask(readstate, trsp);
- rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
- if (checkpolling) {
+static void init_rcu_torture_one_read_state(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp)
+{
+ memset(rtorsp, 0, sizeof(*rtorsp));
+ rtorsp->checkpolling = !(torture_random(trsp) & 0xfff);
+ rtorsp->rtrsp = &rtorsp->rtseg[0];
+}
+
+/*
+ * Set up the first segment of a series of overlapping read-side
+ * critical sections. The caller must have actually initiated the
+ * outermost read-side critical section.
+ */
+static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp, long myid)
+{
+ if (rtorsp->checkpolling) {
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- cookie = cur_ops->get_gp_state();
+ rtorsp->cookie = cur_ops->get_gp_state();
if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
- cur_ops->get_gp_state_full(&cookie_full);
+ cur_ops->get_gp_state_full(&rtorsp->cookie_full);
}
- started = cur_ops->get_gp_seq();
- ts = rcu_trace_clock_local();
- p = rcu_dereference_check(rcu_torture_current,
+ rtorsp->started = cur_ops->get_gp_seq();
+ rtorsp->ts = rcu_trace_clock_local();
+ rtorsp->p = rcu_dereference_check(rcu_torture_current,
!cur_ops->readlock_held || cur_ops->readlock_held());
- if (p == NULL) {
+ if (rtorsp->p == NULL) {
/* Wait for rcu_torture_writer to get underway */
- rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
+ rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp);
return false;
}
- if (p->rtort_mbtest == 0)
+ if (rtorsp->p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
- rcu_torture_reader_do_mbchk(myid, p, trsp);
- rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp);
+ rcu_torture_reader_do_mbchk(myid, rtorsp->p, trsp);
+ return true;
+}
+
+/*
+ * Complete the last segment of a series of overlapping read-side
+ * critical sections and check for errors.
+ */
+static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp,
+ struct torture_random_state *trsp, long myid)
+{
+ int i;
+ unsigned long completed;
+ int pipe_count;
+ bool preempted = false;
+ struct rt_read_seg *rtrsp1;
+
preempt_disable();
- pipe_count = READ_ONCE(p->rtort_pipe_count);
+ pipe_count = READ_ONCE(rtorsp->p->rtort_pipe_count);
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
- /* Should not happen, but... */
+ // Should not happen in a correct RCU implementation,
+ // happens quite often for torture_type=busted.
pipe_count = RCU_TORTURE_PIPE_LEN;
}
completed = cur_ops->get_gp_seq();
if (pipe_count > 1) {
- do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
- ts, started, completed);
+ do_trace_rcu_torture_read(cur_ops->name, &rtorsp->p->rtort_rcu,
+ rtorsp->ts, rtorsp->started, completed);
rcu_ftrace_dump(DUMP_ALL);
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
- completed = rcutorture_seq_diff(completed, started);
+ completed = rcutorture_seq_diff(completed, rtorsp->started);
if (completed > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
completed = RCU_TORTURE_PIPE_LEN;
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
- if (checkpolling) {
+ if (rtorsp->checkpolling) {
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
- WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ WARN_ONCE(cur_ops->poll_gp_state(rtorsp->cookie),
"%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
__func__,
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
- cookie, cur_ops->get_gp_state());
+ rtorsp->cookie, cur_ops->get_gp_state());
if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
- WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ WARN_ONCE(cur_ops->poll_gp_state_full(&rtorsp->cookie_full),
"%s: Cookie check 6 failed %s(%d) online %*pbl\n",
__func__,
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
cpumask_pr_args(cpu_online_mask));
}
- rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
- WARN_ON_ONCE(readstate);
+ if (cur_ops->reader_blocked)
+ preempted = cur_ops->reader_blocked();
+ rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp);
+ WARN_ON_ONCE(rtorsp->readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
- WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1);
+ WARN_ON_ONCE(leakpointer && READ_ONCE(rtorsp->p->rtort_pipe_count) > 1);
/* If error or close call, record the sequence of reader protections. */
if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) {
i = 0;
- for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++)
+ for (rtrsp1 = &rtorsp->rtseg[0]; rtrsp1 < rtorsp->rtrsp; rtrsp1++)
err_segs[i++] = *rtrsp1;
rt_read_nsegs = i;
+ rt_read_preempted = preempted;
}
+}
+/*
+ * Do one read-side critical section, returning false if there was
+ * no data to read. Can be invoked both from process context and
+ * from a timer handler.
+ */
+static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
+{
+ int newstate;
+ struct rcu_torture_one_read_state rtors;
+
+ WARN_ON_ONCE(!rcu_is_watching());
+ init_rcu_torture_one_read_state(&rtors, trsp);
+ newstate = rcutorture_extend_mask(rtors.readstate, trsp);
+ rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++);
+ if (!rcu_torture_one_read_start(&rtors, trsp, myid)) {
+ rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp);
+ return false;
+ }
+ rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, myid < 0, trsp, rtors.rtrsp);
+ rcu_torture_one_read_end(&rtors, trsp, myid);
return true;
}
@@ -2088,7 +2353,7 @@ rcu_torture_reader(void *arg)
set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
timer_setup_on_stack(&t, rcu_torture_timer, 0);
- tick_dep_set_task(current, TICK_DEP_BIT_RCU);
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick.
do {
if (irqreader && cur_ops->irq_capable) {
if (!timer_pending(&t))
@@ -2105,8 +2370,8 @@ rcu_torture_reader(void *arg)
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable) {
- del_timer_sync(&t);
- destroy_timer_on_stack(&t);
+ timer_delete_sync(&t);
+ timer_destroy_on_stack(&t);
}
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
torture_kthread_stopping("rcu_torture_reader");
@@ -2175,6 +2440,7 @@ rcu_torture_stats_print(void)
int i;
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ long n_gpwraps = 0;
struct rcu_torture *rtcp;
static unsigned long rtcv_snap = ULONG_MAX;
static bool splatted;
@@ -2185,6 +2451,8 @@ rcu_torture_stats_print(void)
pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]);
batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
+ if (cur_ops->get_gpwrap_count)
+ n_gpwraps += cur_ops->get_gpwrap_count(cpu);
}
for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
@@ -2216,8 +2484,9 @@ rcu_torture_stats_print(void)
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic.
- pr_cont("nocb-toggles: %ld:%ld\n",
+ pr_cont("nocb-toggles: %ld:%ld ",
atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload));
+ pr_cont("gpwraps: %ld\n", n_gpwraps);
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
@@ -2259,10 +2528,8 @@ rcu_torture_stats_print(void)
int __maybe_unused flags = 0;
unsigned long __maybe_unused gp_seq = 0;
- rcutorture_get_gp_data(cur_ops->ttype,
- &flags, &gp_seq);
- srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
- &flags, &gp_seq);
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
wtp = READ_ONCE(writer_task);
pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
rcu_torture_writer_state_getname(),
@@ -2356,26 +2623,30 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"shuffle_interval=%d stutter=%d irqreader=%d "
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
- "test_boost_duration=%d shutdown_secs=%d "
+ "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d "
"stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
- "stall_cpu_block=%d "
+ "stall_cpu_block=%d stall_cpu_repeat=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
"read_exit_delay=%d read_exit_burst=%d "
+ "reader_flavor=%x "
"nocbs_nthreads=%d nocbs_toggle=%d "
- "test_nmis=%d\n",
- torture_type, tag, nrealreaders, nfakewriters,
+ "test_nmis=%d "
+ "preempt_duration=%d preempt_interval=%d\n",
+ torture_type, tag, nrealreaders, nrealfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
- test_boost_interval, test_boost_duration, shutdown_secs,
+ test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs,
stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
- stall_cpu_block,
+ stall_cpu_block, stall_cpu_repeat,
n_barrier_cbs,
onoff_interval, onoff_holdoff,
read_exit_delay, read_exit_burst,
+ reader_flavor,
nocbs_nthreads, nocbs_toggle,
- test_nmis);
+ test_nmis,
+ preempt_duration, preempt_interval);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -2413,6 +2684,14 @@ static int rcutorture_booster_init(unsigned int cpu)
WARN_ON_ONCE(!t);
sp.sched_priority = 2;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#ifdef CONFIG_IRQ_FORCED_THREADING
+ if (force_irqthreads()) {
+ t = per_cpu(ktimerd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+#endif
}
/* Don't allow time recalculation while creating a new task. */
@@ -2448,19 +2727,11 @@ static struct notifier_block rcu_torture_stall_block = {
* induces a CPU stall for the time specified by stall_cpu. If a new
* stall test is added, stallsdone in rcu_torture_writer() must be adjusted.
*/
-static int rcu_torture_stall(void *args)
+static void rcu_torture_stall_one(int rep, int irqsoff)
{
int idx;
- int ret;
unsigned long stop_at;
- VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
- if (rcu_cpu_stall_notifiers) {
- ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
- if (ret)
- pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
- __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
- }
if (stall_cpu_holdoff > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -2480,14 +2751,14 @@ static int rcu_torture_stall(void *args)
stop_at = ktime_get_seconds() + stall_cpu;
/* RCU CPU stall is expected behavior in following code. */
idx = cur_ops->readlock();
- if (stall_cpu_irqsoff)
+ if (irqsoff)
local_irq_disable();
else if (!stall_cpu_block)
preempt_disable();
- pr_alert("%s start on CPU %d.\n",
- __func__, raw_smp_processor_id());
- while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(),
- stop_at))
+ pr_alert("%s start stall episode %d on CPU %d.\n",
+ __func__, rep + 1, raw_smp_processor_id());
+ while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) &&
+ !kthread_should_stop())
if (stall_cpu_block) {
#ifdef CONFIG_PREEMPTION
preempt_schedule();
@@ -2497,12 +2768,42 @@ static int rcu_torture_stall(void *args)
} else if (stall_no_softlockup) {
touch_softlockup_watchdog();
}
- if (stall_cpu_irqsoff)
+ if (irqsoff)
local_irq_enable();
else if (!stall_cpu_block)
preempt_enable();
cur_ops->readunlock(idx);
}
+}
+
+/*
+ * CPU-stall kthread. Invokes rcu_torture_stall_one() once, and then as many
+ * additional times as specified by the stall_cpu_repeat module parameter.
+ * Note that stall_cpu_irqsoff is ignored on the second and subsequent
+ * stall.
+ */
+static int rcu_torture_stall(void *args)
+{
+ int i;
+ int repeat = stall_cpu_repeat;
+ int ret;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+ if (repeat < 0) {
+ repeat = 0;
+ WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST));
+ }
+ if (rcu_cpu_stall_notifiers) {
+ ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+ __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+ }
+ for (i = 0; i <= repeat; i++) {
+ if (kthread_should_stop())
+ break;
+ rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0);
+ }
pr_alert("%s end.\n", __func__);
if (rcu_cpu_stall_notifiers && !ret) {
ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
@@ -2619,7 +2920,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
rfcpp = rfp->rcu_fwd_cb_tail;
rfp->rcu_fwd_cb_tail = &rfcp->rfc_next;
- WRITE_ONCE(*rfcpp, rfcp);
+ smp_store_release(rfcpp, rfcp);
WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1);
i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
if (i >= ARRAY_SIZE(rfp->n_launders_hist))
@@ -2668,7 +2969,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp)
rcu_torture_fwd_prog_cond_resched(freed);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
local_irq_restore(flags);
}
}
@@ -2785,7 +3086,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
cver = READ_ONCE(rcu_torture_current_version);
gps = cur_ops->get_gp_seq();
rfp->rcu_launder_gp_seq_start = gps;
- tick_dep_set_task(current, TICK_DEP_BIT_RCU);
+ tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick.
while (time_before(jiffies, stopat) &&
!shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
@@ -2818,7 +3119,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
local_irq_restore(flags);
}
}
@@ -2832,13 +3133,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
!shutdown_time_arrived()) {
- WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
- pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
+ if (WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED) && cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
+ pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n",
__func__,
stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat,
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
- n_max_gps, n_max_cbs, cver, gps);
+ n_max_gps, n_max_cbs, cver, gps, num_online_cpus());
atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
rcu_torture_fwd_cb_hist(rfp);
@@ -2974,12 +3276,12 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress = 0;
return 0;
}
- if (stall_cpu > 0) {
- VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing");
+ if (stall_cpu > 0 || (preempt_duration > 0 && IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall and/or preemption testing");
fwd_progress = 0;
if (IS_MODULE(CONFIG_RCU_TORTURE_TEST))
return -EINVAL; /* In module, can fail back to user. */
- WARN_ON(1); /* Make sure rcutorture notices conflict. */
+ WARN_ON(1); /* Make sure rcutorture scripting notices conflict. */
return 0;
}
if (fwd_progress_holdoff <= 0)
@@ -3040,11 +3342,12 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
}
/* IPI handler to get callback posted on desired CPU, if online. */
-static void rcu_torture_barrier1cb(void *rcu_void)
+static int rcu_torture_barrier1cb(void *rcu_void)
{
struct rcu_head *rhp = rcu_void;
cur_ops->call(rhp, rcu_torture_barrier_cbf);
+ return 0;
}
/* kthread function to register callbacks used to test RCU barriers. */
@@ -3070,11 +3373,9 @@ static int rcu_torture_barrier_cbs(void *arg)
* The above smp_load_acquire() ensures barrier_phase load
* is ordered before the following ->call().
*/
- if (smp_call_function_single(myid, rcu_torture_barrier1cb,
- &rcu, 1)) {
- // IPI failed, so use direct call from current CPU.
+ if (smp_call_on_cpu(myid, rcu_torture_barrier1cb, &rcu, 1))
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
- }
+
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
@@ -3325,8 +3626,88 @@ static void rcutorture_test_nmis(int n)
#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
}
+// Randomly preempt online CPUs.
+static int rcu_torture_preempt(void *unused)
+{
+ int cpu = -1;
+ DEFINE_TORTURE_RANDOM(rand);
+
+ schedule_timeout_idle(stall_cpu_holdoff);
+ do {
+ // Wait for preempt_interval ms with up to 100us fuzz.
+ torture_hrtimeout_ms(preempt_interval, 100, &rand);
+ // Select online CPU.
+ cpu = cpumask_next(cpu, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_next(-1, cpu_online_mask);
+ WARN_ON_ONCE(cpu >= nr_cpu_ids);
+ // Move to that CPU, if can't do so, retry later.
+ if (torture_sched_setaffinity(current->pid, cpumask_of(cpu), false))
+ continue;
+ // Preempt at high-ish priority, then reset to normal.
+ sched_set_fifo(current);
+ torture_sched_setaffinity(current->pid, cpu_present_mask, true);
+ mdelay(preempt_duration);
+ sched_set_normal(current, 0);
+ stutter_wait("rcu_torture_preempt");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_preempt");
+ return 0;
+}
+
static enum cpuhp_state rcutor_hp;
+static struct hrtimer gpwrap_lag_timer;
+static bool gpwrap_lag_active;
+
+/* Timer handler for toggling RCU grace-period sequence overflow test lag value */
+static enum hrtimer_restart rcu_gpwrap_lag_timer(struct hrtimer *timer)
+{
+ ktime_t next_delay;
+
+ if (gpwrap_lag_active) {
+ pr_alert("rcu-torture: Disabling gpwrap lag (value=0)\n");
+ cur_ops->set_gpwrap_lag(0);
+ gpwrap_lag_active = false;
+ next_delay = ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0);
+ } else {
+ pr_alert("rcu-torture: Enabling gpwrap lag (value=%d)\n", gpwrap_lag_gps);
+ cur_ops->set_gpwrap_lag(gpwrap_lag_gps);
+ gpwrap_lag_active = true;
+ next_delay = ktime_set(gpwrap_lag_active_mins * 60, 0);
+ }
+
+ if (torture_must_stop_irq())
+ return HRTIMER_NORESTART;
+
+ hrtimer_forward_now(timer, next_delay);
+ return HRTIMER_RESTART;
+}
+
+static int rcu_gpwrap_lag_init(void)
+{
+ if (!gpwrap_lag)
+ return 0;
+
+ if (gpwrap_lag_cycle_mins <= 0 || gpwrap_lag_active_mins <= 0) {
+ pr_alert("rcu-torture: lag timing parameters must be positive\n");
+ return -EINVAL;
+ }
+
+ hrtimer_setup(&gpwrap_lag_timer, rcu_gpwrap_lag_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ gpwrap_lag_active = false;
+ hrtimer_start(&gpwrap_lag_timer,
+ ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0), HRTIMER_MODE_REL);
+
+ return 0;
+}
+
+static void rcu_gpwrap_lag_cleanup(void)
+{
+ hrtimer_cancel(&gpwrap_lag_timer);
+ cur_ops->set_gpwrap_lag(0);
+ gpwrap_lag_active = false;
+}
static void
rcu_torture_cleanup(void)
{
@@ -3334,18 +3715,19 @@ rcu_torture_cleanup(void)
int flags = 0;
unsigned long gp_seq = 0;
int i;
+ int j;
if (torture_cleanup_begin()) {
if (cur_ops->cb_barrier != NULL) {
pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
}
- rcu_gp_slow_unregister(NULL);
+ if (cur_ops->gp_slow_unregister)
+ cur_ops->gp_slow_unregister(NULL);
return;
}
if (!cur_ops) {
torture_cleanup_end();
- rcu_gp_slow_unregister(NULL);
return;
}
@@ -3353,6 +3735,7 @@ rcu_torture_cleanup(void)
if (cur_ops->gp_kthread_dbg)
cur_ops->gp_kthread_dbg();
+ torture_stop_kthread(rcu_torture_preempt, preempt_task);
rcu_torture_read_exit_cleanup();
rcu_torture_barrier_cleanup();
rcu_torture_fwd_prog_cleanup();
@@ -3377,15 +3760,15 @@ rcu_torture_cleanup(void)
rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++)
+ for (i = 0; i < nrealfakewriters; i++)
torture_stop_kthread(rcu_torture_fakewriter,
fakewriter_tasks[i]);
kfree(fakewriter_tasks);
fakewriter_tasks = NULL;
}
- rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq);
- srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq);
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n",
cur_ops->name, (long)gp_seq, flags,
rcutorture_seq_diff(gp_seq, start_gp_seq));
@@ -3415,26 +3798,74 @@ rcu_torture_cleanup(void)
pr_alert("\t: No segments recorded!!!\n");
firsttime = 1;
for (i = 0; i < rt_read_nsegs; i++) {
- pr_alert("\t%d: %#x ", i, err_segs[i].rt_readstate);
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP))
+ pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL));
+ else
+ pr_alert("\t");
+ pr_cont("%d: %#4x", i, err_segs[i].rt_readstate);
if (err_segs[i].rt_delay_jiffies != 0) {
pr_cont("%s%ldjiffies", firsttime ? "" : "+",
err_segs[i].rt_delay_jiffies);
firsttime = 0;
}
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) {
+ pr_cont(" CPU %2d", err_segs[i].rt_cpu);
+ if (err_segs[i].rt_cpu != err_segs[i].rt_end_cpu)
+ pr_cont("->%-2d", err_segs[i].rt_end_cpu);
+ else
+ pr_cont(" ...");
+ }
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) &&
+ cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) {
+ char buf1[20+1];
+ char buf2[20+1];
+ char sepchar = '-';
+
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq,
+ buf1, ARRAY_SIZE(buf1));
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end,
+ buf2, ARRAY_SIZE(buf2));
+ if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) {
+ if (buf2[0]) {
+ for (j = 0; buf2[j]; j++)
+ buf2[j] = '.';
+ if (j)
+ buf2[j - 1] = ' ';
+ }
+ sepchar = ' ';
+ }
+ pr_cont(" %s%c%s", buf1, sepchar, buf2);
+ }
if (err_segs[i].rt_delay_ms != 0) {
- pr_cont("%s%ldms", firsttime ? "" : "+",
+ pr_cont(" %s%ldms", firsttime ? "" : "+",
err_segs[i].rt_delay_ms);
firsttime = 0;
}
if (err_segs[i].rt_delay_us != 0) {
- pr_cont("%s%ldus", firsttime ? "" : "+",
+ pr_cont(" %s%ldus", firsttime ? "" : "+",
err_segs[i].rt_delay_us);
firsttime = 0;
}
- pr_cont("%s\n",
- err_segs[i].rt_preempted ? "preempted" : "");
+ pr_cont("%s", err_segs[i].rt_preempted ? " preempted" : "");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_BH)
+ pr_cont(" BH");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_IRQ)
+ pr_cont(" IRQ");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_PREEMPT)
+ pr_cont(" PREEMPT");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RBH)
+ pr_cont(" RBH");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_SCHED)
+ pr_cont(" SCHED");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_1)
+ pr_cont(" RCU_1");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_2)
+ pr_cont(" RCU_2");
+ pr_cont("\n");
}
+ if (rt_read_preempted)
+ pr_alert("\tReader was preempted.\n");
}
if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
@@ -3444,10 +3875,13 @@ rcu_torture_cleanup(void)
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
torture_cleanup_end();
- rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay);
+ if (cur_ops->gp_slow_unregister)
+ cur_ops->gp_slow_unregister(NULL);
+
+ if (gpwrap_lag && cur_ops->set_gpwrap_lag)
+ rcu_gpwrap_lag_cleanup();
}
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
static void rcu_torture_leak_cb(struct rcu_head *rhp)
{
}
@@ -3465,7 +3899,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
*/
pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME);
}
-#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
/*
* Verify that double-free causes debug-objects to complain, but only
@@ -3474,39 +3907,43 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
*/
static void rcu_test_debug_objects(void)
{
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
struct rcu_head rh1;
struct rcu_head rh2;
+ int idx;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) {
+ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n",
+ KBUILD_MODNAME, cur_ops->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(cur_ops->debug_objects &&
+ (!cur_ops->call || !cur_ops->cb_barrier)))
+ return;
+
struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
- pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME);
+ pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name);
/* Try to queue the rh2 pair of callbacks for the same grace period. */
- preempt_disable(); /* Prevent preemption from interrupting test. */
- rcu_read_lock(); /* Make it impossible to finish a grace period. */
- call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */
- local_irq_disable(); /* Make it harder to start a new grace period. */
- call_rcu_hurry(&rh2, rcu_torture_leak_cb);
- call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */
+ cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ cur_ops->call(&rh2, rcu_torture_leak_cb);
+ cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
if (rhp) {
- call_rcu_hurry(rhp, rcu_torture_leak_cb);
- call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ cur_ops->call(rhp, rcu_torture_leak_cb);
+ cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
}
- local_irq_enable();
- rcu_read_unlock();
- preempt_enable();
+ cur_ops->readunlock(idx);
/* Wait for them all to get done so we can safely return. */
- rcu_barrier();
- pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
+ cur_ops->cb_barrier();
+ pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
kfree(rhp);
-#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
- pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
-#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
}
static void rcutorture_sync(void)
@@ -3748,6 +4185,14 @@ rcu_torture_init(void)
rcu_torture_init_srcu_lockdep();
+ if (nfakewriters >= 0) {
+ nrealfakewriters = nfakewriters;
+ } else {
+ nrealfakewriters = num_online_cpus() - 2 - nfakewriters;
+ if (nrealfakewriters <= 0)
+ nrealfakewriters = 1;
+ }
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -3756,8 +4201,8 @@ rcu_torture_init(void)
nrealreaders = 1;
}
rcu_torture_print_module_parms(cur_ops, "Start of test");
- rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq);
- srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq);
+ if (cur_ops->get_gp_data)
+ cur_ops->get_gp_data(&flags, &gp_seq);
start_gp_seq = gp_seq;
pr_alert("%s: Start-test grace-period state: g%ld f%#x\n",
cur_ops->name, (long)gp_seq, flags);
@@ -3804,8 +4249,9 @@ rcu_torture_init(void)
writer_task);
if (torture_init_error(firsterr))
goto unwind;
- if (nfakewriters > 0) {
- fakewriter_tasks = kcalloc(nfakewriters,
+
+ if (nrealfakewriters > 0) {
+ fakewriter_tasks = kcalloc(nrealfakewriters,
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
@@ -3814,7 +4260,7 @@ rcu_torture_init(void)
goto unwind;
}
}
- for (i = 0; i < nfakewriters; i++) {
+ for (i = 0; i < nrealfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
NULL, fakewriter_tasks[i]);
if (torture_init_error(firsterr))
@@ -3923,10 +4369,24 @@ rcu_torture_init(void)
firsterr = rcu_torture_read_exit_init();
if (torture_init_error(firsterr))
goto unwind;
+ if (preempt_duration > 0) {
+ firsterr = torture_create_kthread(rcu_torture_preempt, NULL, preempt_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
if (object_debug)
rcu_test_debug_objects();
+
+ if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister))
+ cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay);
+
+ if (gpwrap_lag && cur_ops->set_gpwrap_lag) {
+ firsterr = rcu_gpwrap_lag_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+
torture_init_end();
- rcu_gp_slow_register(&rcu_fwd_cb_nodelay);
return 0;
unwind:
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 2c2648a3ad30..f11a7c2af778 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -28,6 +28,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/reboot.h>
#include <linux/sched.h>
+#include <linux/seq_buf.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/stat.h>
@@ -35,6 +36,7 @@
#include <linux/slab.h>
#include <linux/torture.h>
#include <linux/types.h>
+#include <linux/sched/clock.h>
#include "rcu.h"
@@ -63,6 +65,7 @@ do { \
#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
+MODULE_DESCRIPTION("Scalability test for object reference mechanisms");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
@@ -73,6 +76,9 @@ MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
+// Number of seconds to extend warm-up and cool-down for multiple guest OSes
+torture_param(long, guest_os_delay, 0,
+ "Number of seconds to extend warm-up/cool-down for multiple guest OSes.");
// Wait until there are multiple CPUs before starting test.
torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
"Holdoff time before test start (s)");
@@ -133,7 +139,7 @@ struct ref_scale_ops {
const char *name;
};
-static struct ref_scale_ops *cur_ops;
+static const struct ref_scale_ops *cur_ops;
static void un_delay(const int udl, const int ndl)
{
@@ -169,7 +175,7 @@ static bool rcu_sync_scale_init(void)
return true;
}
-static struct ref_scale_ops rcu_ops = {
+static const struct ref_scale_ops rcu_ops = {
.init = rcu_sync_scale_init,
.readsection = ref_rcu_read_section,
.delaysection = ref_rcu_delay_section,
@@ -203,13 +209,73 @@ static void srcu_ref_scale_delay_section(const int nloops, const int udl, const
}
}
-static struct ref_scale_ops srcu_ops = {
+static const struct ref_scale_ops srcu_ops = {
.init = rcu_sync_scale_init,
.readsection = srcu_ref_scale_read_section,
.delaysection = srcu_ref_scale_delay_section,
.name = "srcu"
};
+static void srcu_fast_ref_scale_read_section(const int nloops)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
+ }
+}
+
+static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
+ }
+}
+
+static const struct ref_scale_ops srcu_fast_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = srcu_fast_ref_scale_read_section,
+ .delaysection = srcu_fast_ref_scale_delay_section,
+ .name = "srcu-fast"
+};
+
+static void srcu_lite_ref_scale_read_section(const int nloops)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock_lite(srcu_ctlp);
+ srcu_read_unlock_lite(srcu_ctlp, idx);
+ }
+}
+
+static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ int idx;
+
+ for (i = nloops; i >= 0; i--) {
+ idx = srcu_read_lock_lite(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock_lite(srcu_ctlp, idx);
+ }
+}
+
+static const struct ref_scale_ops srcu_lite_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = srcu_lite_ref_scale_read_section,
+ .delaysection = srcu_lite_ref_scale_delay_section,
+ .name = "srcu-lite"
+};
+
#ifdef CONFIG_TASKS_RCU
// Definitions for RCU Tasks ref scale testing: Empty read markers.
@@ -230,7 +296,7 @@ static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, c
un_delay(udl, ndl);
}
-static struct ref_scale_ops rcu_tasks_ops = {
+static const struct ref_scale_ops rcu_tasks_ops = {
.init = rcu_sync_scale_init,
.readsection = rcu_tasks_ref_scale_read_section,
.delaysection = rcu_tasks_ref_scale_delay_section,
@@ -269,7 +335,7 @@ static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, c
}
}
-static struct ref_scale_ops rcu_trace_ops = {
+static const struct ref_scale_ops rcu_trace_ops = {
.init = rcu_sync_scale_init,
.readsection = rcu_trace_ref_scale_read_section,
.delaysection = rcu_trace_ref_scale_delay_section,
@@ -308,7 +374,7 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int
}
}
-static struct ref_scale_ops refcnt_ops = {
+static const struct ref_scale_ops refcnt_ops = {
.init = rcu_sync_scale_init,
.readsection = ref_refcnt_section,
.delaysection = ref_refcnt_delay_section,
@@ -345,7 +411,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int
}
}
-static struct ref_scale_ops rwlock_ops = {
+static const struct ref_scale_ops rwlock_ops = {
.init = ref_rwlock_init,
.readsection = ref_rwlock_section,
.delaysection = ref_rwlock_delay_section,
@@ -382,7 +448,7 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n
}
}
-static struct ref_scale_ops rwsem_ops = {
+static const struct ref_scale_ops rwsem_ops = {
.init = ref_rwsem_init,
.readsection = ref_rwsem_section,
.delaysection = ref_rwsem_delay_section,
@@ -417,7 +483,7 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd
preempt_enable();
}
-static struct ref_scale_ops lock_ops = {
+static const struct ref_scale_ops lock_ops = {
.readsection = ref_lock_section,
.delaysection = ref_lock_delay_section,
.name = "lock"
@@ -452,7 +518,7 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in
preempt_enable();
}
-static struct ref_scale_ops lock_irq_ops = {
+static const struct ref_scale_ops lock_irq_ops = {
.readsection = ref_lock_irq_section,
.delaysection = ref_lock_irq_delay_section,
.name = "lock-irq"
@@ -488,7 +554,7 @@ static void ref_acqrel_delay_section(const int nloops, const int udl, const int
preempt_enable();
}
-static struct ref_scale_ops acqrel_ops = {
+static const struct ref_scale_ops acqrel_ops = {
.readsection = ref_acqrel_section,
.delaysection = ref_acqrel_delay_section,
.name = "acqrel"
@@ -496,6 +562,39 @@ static struct ref_scale_ops acqrel_ops = {
static volatile u64 stopopts;
+static void ref_sched_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += sched_clock();
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_sched_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += sched_clock();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static const struct ref_scale_ops sched_clock_ops = {
+ .readsection = ref_sched_clock_section,
+ .delaysection = ref_sched_clock_delay_section,
+ .name = "sched-clock"
+};
+
+
static void ref_clock_section(const int nloops)
{
u64 x = 0;
@@ -522,7 +621,7 @@ static void ref_clock_delay_section(const int nloops, const int udl, const int n
stopopts = x;
}
-static struct ref_scale_ops clock_ops = {
+static const struct ref_scale_ops clock_ops = {
.readsection = ref_clock_section,
.delaysection = ref_clock_delay_section,
.name = "clock"
@@ -554,7 +653,7 @@ static void ref_jiffies_delay_section(const int nloops, const int udl, const int
stopopts = x;
}
-static struct ref_scale_ops jiffies_ops = {
+static const struct ref_scale_ops jiffies_ops = {
.readsection = ref_jiffies_section,
.delaysection = ref_jiffies_delay_section,
.name = "jiffies"
@@ -704,9 +803,9 @@ static void refscale_typesafe_ctor(void *rtsp_in)
preempt_enable();
}
-static struct ref_scale_ops typesafe_ref_ops;
-static struct ref_scale_ops typesafe_lock_ops;
-static struct ref_scale_ops typesafe_seqlock_ops;
+static const struct ref_scale_ops typesafe_ref_ops;
+static const struct ref_scale_ops typesafe_lock_ops;
+static const struct ref_scale_ops typesafe_seqlock_ops;
// Initialize for a typesafe test.
static bool typesafe_init(void)
@@ -767,7 +866,7 @@ static void typesafe_cleanup(void)
}
// The typesafe_init() function distinguishes these structures by address.
-static struct ref_scale_ops typesafe_ref_ops = {
+static const struct ref_scale_ops typesafe_ref_ops = {
.init = typesafe_init,
.cleanup = typesafe_cleanup,
.readsection = typesafe_read_section,
@@ -775,7 +874,7 @@ static struct ref_scale_ops typesafe_ref_ops = {
.name = "typesafe_ref"
};
-static struct ref_scale_ops typesafe_lock_ops = {
+static const struct ref_scale_ops typesafe_lock_ops = {
.init = typesafe_init,
.cleanup = typesafe_cleanup,
.readsection = typesafe_read_section,
@@ -783,7 +882,7 @@ static struct ref_scale_ops typesafe_lock_ops = {
.name = "typesafe_lock"
};
-static struct ref_scale_ops typesafe_seqlock_ops = {
+static const struct ref_scale_ops typesafe_seqlock_ops = {
.init = typesafe_init,
.cleanup = typesafe_cleanup,
.readsection = typesafe_read_section,
@@ -799,6 +898,18 @@ static void rcu_scale_one_reader(void)
cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
}
+// Warm up cache, or, if needed run a series of rcu_scale_one_reader()
+// to allow multiple rcuscale guest OSes to collect mutually valid data.
+static void rcu_scale_warm_cool(void)
+{
+ unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1);
+
+ do {
+ rcu_scale_one_reader();
+ cond_resched();
+ } while (time_before(jiffies, jdone));
+}
+
// Reader kthread. Repeatedly does empty RCU read-side
// critical section, minimizing update-side interference.
static int
@@ -827,7 +938,7 @@ repeat:
goto end;
// Make sure that the CPU is affinitized appropriately during testing.
- WARN_ON_ONCE(raw_smp_processor_id() != me);
+ WARN_ON_ONCE(raw_smp_processor_id() != me % nr_cpu_ids);
WRITE_ONCE(rt->start_reader, 0);
if (!atomic_dec_return(&n_started))
@@ -890,32 +1001,34 @@ static u64 process_durations(int n)
{
int i;
struct reader_task *rt;
- char buf1[64];
+ struct seq_buf s;
char *buf;
u64 sum = 0;
buf = kmalloc(800 + 64, GFP_KERNEL);
if (!buf)
return 0;
- buf[0] = 0;
- sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
- exp_idx);
+ seq_buf_init(&s, buf, 800 + 64);
+
+ seq_buf_printf(&s, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+ exp_idx);
for (i = 0; i < n && !torture_must_stop(); i++) {
rt = &(reader_tasks[i]);
- sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
if (i % 5 == 0)
- strcat(buf, "\n");
- if (strlen(buf) >= 800) {
- pr_alert("%s", buf);
- buf[0] = 0;
+ seq_buf_putc(&s, '\n');
+
+ if (seq_buf_used(&s) >= 800) {
+ pr_alert("%s", seq_buf_str(&s));
+ seq_buf_clear(&s);
}
- strcat(buf, buf1);
+
+ seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns);
sum += rt->last_duration_ns;
}
- pr_alert("%s\n", buf);
+ pr_alert("%s\n", seq_buf_str(&s));
kfree(buf);
return sum;
@@ -953,6 +1066,7 @@ static int main_func(void *arg)
schedule_timeout_uninterruptible(1);
// Start exp readers up per experiment
+ rcu_scale_warm_cool();
for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
if (torture_must_stop())
goto end;
@@ -983,6 +1097,7 @@ static int main_func(void *arg)
result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
}
+ rcu_scale_warm_cool();
// Print the average of all experiments
SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
@@ -1022,7 +1137,7 @@ end:
}
static void
-ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
+ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
"--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
@@ -1077,9 +1192,10 @@ ref_scale_init(void)
{
long i;
int firsterr = 0;
- static struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
- &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops,
+ static const struct ref_scale_ops *scale_ops[] = {
+ &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
+ &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops,
+ &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index c38e5933a5d6..6e9fe2ce1075 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -20,7 +20,11 @@
#include "rcu_segcblist.h"
#include "rcu.h"
+#ifndef CONFIG_TREE_RCU
int rcu_scheduler_active __read_mostly;
+#else // #ifndef CONFIG_TREE_RCU
+extern int rcu_scheduler_active;
+#endif // #else // #ifndef CONFIG_TREE_RCU
static LIST_HEAD(srcu_boot_list);
static bool srcu_init_done;
@@ -96,9 +100,12 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
*/
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
+ int newval;
+ preempt_disable(); // Needed for PREEMPT_LAZY
+ newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+ preempt_enable();
if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
swake_up_one(&ssp->srcu_wq);
}
@@ -117,8 +124,11 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ preempt_disable(); // Needed for PREEMPT_LAZY
+ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
+ preempt_enable();
return; /* Already running or nothing to do. */
+ }
/* Remove recently arrived callbacks and wait for readers. */
WRITE_ONCE(ssp->srcu_gp_running, true);
@@ -130,9 +140,12 @@ void srcu_drive_gp(struct work_struct *wp)
idx = (ssp->srcu_idx & 0x2) / 2;
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
+ preempt_enable();
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
+ preempt_disable(); // Needed for PREEMPT_LAZY
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ preempt_enable();
/* Invoke the callbacks we removed above. */
while (lh) {
@@ -150,8 +163,11 @@ void srcu_drive_gp(struct work_struct *wp)
* at interrupt level, but the ->srcu_gp_running checks will
* straighten that out.
*/
+ preempt_disable(); // Needed for PREEMPT_LAZY
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+ idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
+ preempt_enable();
+ if (idx)
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -160,9 +176,12 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
unsigned long cookie;
+ preempt_disable(); // Needed for PREEMPT_LAZY
cookie = get_state_synchronize_srcu(ssp);
- if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
+ preempt_enable();
return;
+ }
WRITE_ONCE(ssp->srcu_idx_max, cookie);
if (!READ_ONCE(ssp->srcu_gp_running)) {
if (likely(srcu_init_done))
@@ -170,6 +189,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
else if (list_empty(&ssp->srcu_work.entry))
list_add(&ssp->srcu_work.entry, &srcu_boot_list);
}
+ preempt_enable();
}
/*
@@ -183,11 +203,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rhp->func = func;
rhp->next = NULL;
+ preempt_disable(); // Needed for PREEMPT_LAZY
local_irq_save(flags);
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
srcu_gp_start_if_needed(ssp);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -241,9 +263,12 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
*/
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
{
- unsigned long ret = get_state_synchronize_srcu(ssp);
+ unsigned long ret;
+ preempt_disable(); // Needed for PREEMPT_LAZY
+ ret = get_state_synchronize_srcu(ssp);
srcu_gp_start_if_needed(ssp);
+ preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
@@ -256,15 +281,18 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
barrier();
- return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
+ return cookie == SRCU_GET_STATE_COMPLETED ||
+ ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+#ifndef CONFIG_TREE_RCU
/* Lockdep diagnostics. */
void __init rcu_scheduler_starting(void)
{
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
}
+#endif // #ifndef CONFIG_TREE_RCU
/*
* Queue work for srcu_struct structures with early boot callbacks.
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index e4d673fc30f4..48047260697e 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -116,8 +116,9 @@ do { \
/*
* Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
- * srcu_read_unlock() running against them. So if the is_static parameter
- * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
+ * srcu_read_unlock() running against them. So if the is_static
+ * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and
+ * ->srcu_ctrs[].srcu_unlocks.
*/
static void init_srcu_struct_data(struct srcu_struct *ssp)
{
@@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
- ARRAY_SIZE(sdp->srcu_unlock_count));
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
@@ -137,6 +136,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
sdp->srcu_cblist_invoking = false;
sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head;
sdp->mynode = NULL;
sdp->cpu = cpu;
INIT_WORK(&sdp->work, srcu_invoke_callbacks);
@@ -186,7 +186,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
/* Each pass through this loop initializes one srcu_node structure. */
srcu_for_each_node_breadth_first(ssp, snp) {
spin_lock_init(&ACCESS_PRIVATE(snp, lock));
- WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
+ BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
@@ -246,19 +246,20 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->srcu_sup->node = NULL;
mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
- ssp->srcu_idx = 0;
- ssp->srcu_sup->srcu_gp_seq = 0;
+ ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_barrier_seq = 0;
mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
ssp->srcu_sup->sda_is_static = is_static;
- if (!is_static)
+ if (!is_static) {
ssp->sda = alloc_percpu(struct srcu_data);
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
+ }
if (!ssp->sda)
goto err_free_sup;
init_srcu_struct_data(ssp);
- ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+ ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
@@ -266,7 +267,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
ssp->srcu_sup->srcu_ssp = ssp;
- smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
+ smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed,
+ SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */
return 0;
err_free_sda:
@@ -417,41 +419,60 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
}
/*
- * Returns approximate total of the readers' ->srcu_lock_count[] values
- * for the rank of per-CPU counters specified by idx.
+ * Is the current or any upcoming grace period to be expedited?
+ */
+static bool srcu_gp_is_expedited(struct srcu_struct *ssp)
+{
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp));
+}
+
+/*
+ * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks
+ * values for the rank of per-CPU counters specified by idx, and returns
+ * true if the caller did the proper barrier (gp), and if the count of
+ * the locks matches that of the unlocks passed in.
*/
-static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
+static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks)
{
int cpu;
+ unsigned long mask = 0;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&cpuc->srcu_lock_count[idx]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks);
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
- return sum;
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
+ "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
+ if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp)
+ return false;
+ return sum == unlocks;
}
/*
- * Returns approximate total of the readers' ->srcu_unlock_count[] values
- * for the rank of per-CPU counters specified by idx.
+ * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks
+ * values for the rank of per-CPU counters specified by idx.
*/
-static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
+static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm)
{
int cpu;
unsigned long mask = 0;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
- if (IS_ENABLED(CONFIG_PROVE_RCU))
- mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks);
+ mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
- WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
- "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
+ "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
+ *rdm = mask;
return sum;
}
@@ -461,69 +482,76 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
*/
static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
{
+ bool did_gp;
+ unsigned long rdm;
unsigned long unlocks;
- unlocks = srcu_readers_unlock_idx(ssp, idx);
+ unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm);
+ did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP);
/*
* Make sure that a lock is always counted if the corresponding
* unlock is counted. Needs to be a smp_mb() as the read side may
* contain a read from a variable that is written to before the
* synchronize_srcu() in the write side. In this case smp_mb()s
- * A and B act like the store buffering pattern.
+ * A and B (or X and Y) act like the store buffering pattern.
*
- * This smp_mb() also pairs with smp_mb() C to prevent accesses
- * after the synchronize_srcu() from being executed before the
- * grace period ends.
+ * This smp_mb() also pairs with smp_mb() C (or, in the case of X,
+ * Z) to prevent accesses after the synchronize_srcu() from being
+ * executed before the grace period ends.
*/
- smp_mb(); /* A */
+ if (!did_gp)
+ smp_mb(); /* A */
+ else
+ synchronize_rcu(); /* X */
/*
* If the locks are the same as the unlocks, then there must have
* been no readers on this index at some point in this function.
* But there might be more readers, as a task might have read
- * the current ->srcu_idx but not yet have incremented its CPU's
- * ->srcu_lock_count[idx] counter. In fact, it is possible
+ * the current ->srcu_ctrp but not yet have incremented its CPU's
+ * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible
* that most of the tasks have been preempted between fetching
- * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
- * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
- * in a system whose address space was fully populated with memory.
- * Call this quantity Nt.
+ * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And
+ * there could be almost (ULONG_MAX / sizeof(struct task_struct))
+ * tasks in a system whose address space was fully populated
+ * with memory. Call this quantity Nt.
*
- * So suppose that the updater is preempted at this point in the
- * code for a long time. That now-preempted updater has already
- * flipped ->srcu_idx (possibly during the preceding grace period),
- * done an smp_mb() (again, possibly during the preceding grace
- * period), and summed up the ->srcu_unlock_count[idx] counters.
- * How many times can a given one of the aforementioned Nt tasks
- * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
- * counter, in the absence of nesting?
+ * So suppose that the updater is preempted at this
+ * point in the code for a long time. That now-preempted
+ * updater has already flipped ->srcu_ctrp (possibly during
+ * the preceding grace period), done an smp_mb() (again,
+ * possibly during the preceding grace period), and summed up
+ * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times
+ * can a given one of the aforementioned Nt tasks increment the
+ * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter,
+ * in the absence of nesting?
*
* It can clearly do so once, given that it has already fetched
- * the old value of ->srcu_idx and is just about to use that value
- * to index its increment of ->srcu_lock_count[idx]. But as soon as
- * it leaves that SRCU read-side critical section, it will increment
- * ->srcu_unlock_count[idx], which must follow the updater's above
- * read from that same value. Thus, as soon the reading task does
- * an smp_mb() and a later fetch from ->srcu_idx, that task will be
- * guaranteed to get the new index. Except that the increment of
- * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
- * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
- * is before the smp_mb(). Thus, that task might not see the new
- * value of ->srcu_idx until the -second- __srcu_read_lock(),
- * which in turn means that this task might well increment
- * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
- * not just once.
+ * the old value of ->srcu_ctrp and is just about to use that
+ * value to index its increment of ->srcu_ctrs[idx].srcu_locks.
+ * But as soon as it leaves that SRCU read-side critical section,
+ * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must
+ * follow the updater's above read from that same value. Thus,
+ as soon the reading task does an smp_mb() and a later fetch from
+ * ->srcu_ctrp, that task will be guaranteed to get the new index.
+ * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks
+ * in __srcu_read_unlock() is after the smp_mb(), and the fetch
+ * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb().
+ * Thus, that task might not see the new value of ->srcu_ctrp until
+ * the -second- __srcu_read_lock(), which in turn means that this
+ * task might well increment ->srcu_ctrs[idx].srcu_locks for the
+ * old value of ->srcu_ctrp twice, not just once.
*
* However, it is important to note that a given smp_mb() takes
* effect not just for the task executing it, but also for any
* later task running on that same CPU.
*
- * That is, there can be almost Nt + Nc further increments of
- * ->srcu_lock_count[idx] for the old index, where Nc is the number
- * of CPUs. But this is OK because the size of the task_struct
- * structure limits the value of Nt and current systems limit Nc
- * to a few thousand.
+ * That is, there can be almost Nt + Nc further increments
+ * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc
+ * is the number of CPUs. But this is OK because the size of
+ * the task_struct structure limits the value of Nt and current
+ * systems limit Nc to a few thousand.
*
* OK, but what about nesting? This does impose a limit on
* nesting of half of the size of the task_struct structure
@@ -534,7 +562,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
* which are unlikely to be configured with an address space fully
* populated with memory, at least not anytime soon.
*/
- return srcu_readers_lock_idx(ssp, idx) == unlocks;
+ return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks);
}
/**
@@ -552,12 +580,12 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
- struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&cpuc->srcu_lock_count[0]);
- sum += atomic_long_read(&cpuc->srcu_lock_count[1]);
- sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]);
- sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks);
+ sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks);
}
return sum;
}
@@ -620,7 +648,8 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
unsigned long jbase = SRCU_INTERVAL;
struct srcu_usage *sup = ssp->srcu_sup;
- if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ if (srcu_gp_is_expedited(ssp))
jbase = 0;
if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
j = jiffies - 1;
@@ -628,6 +657,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
if (time_after(j, gpstart))
jbase += j - gpstart;
if (!jbase) {
+ ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay);
WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
jbase = 1;
@@ -646,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ unsigned long delay;
struct srcu_usage *sup = ssp->srcu_sup;
- if (WARN_ON(!srcu_get_delay(ssp)))
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ delay = srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ if (WARN_ON(!delay))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
@@ -656,7 +690,7 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- del_timer_sync(&sdp->delay_work);
+ timer_delete_sync(&sdp->delay_work);
flush_work(&sdp->work);
if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
return; /* Forgot srcu_barrier(), so just leak it! */
@@ -667,7 +701,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
__func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
- return; /* Caller forgot to stop doing call_srcu()? */
+ return; // Caller forgot to stop doing call_srcu()?
+ // Or caller invoked start_poll_synchronize_srcu()
+ // and then cleanup_srcu_struct() before that grace
+ // period ended?
}
kfree(sup->node);
sup->node = NULL;
@@ -681,42 +718,42 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-#ifdef CONFIG_PROVE_RCU
/*
- * Check for consistent NMI safety.
+ * Check for consistent reader flavor.
*/
-void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
+void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
{
- int nmi_safe_mask = 1 << nmi_safe;
- int old_nmi_safe_mask;
+ int old_read_flavor;
struct srcu_data *sdp;
- /* NMI-unsafe use in NMI is a bad sign */
- WARN_ON_ONCE(!nmi_safe && in_nmi());
+ /* NMI-unsafe use in NMI is a bad sign, as is multi-bit read_flavor values. */
+ WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi());
+ WARN_ON_ONCE(read_flavor & (read_flavor - 1));
+
sdp = raw_cpu_ptr(ssp->sda);
- old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
- if (!old_nmi_safe_mask) {
- WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
- return;
+ old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor);
+ if (!old_read_flavor) {
+ old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
+ if (!old_read_flavor)
+ return;
}
- WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
+ WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor);
}
-EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
-#endif /* CONFIG_PROVE_RCU */
+EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct.
- * Returns an index that must be passed to the matching srcu_read_unlock().
+ * Returns a guaranteed non-negative index that must be passed to the
+ * matching __srcu_read_unlock().
*/
int __srcu_read_lock(struct srcu_struct *ssp)
{
- int idx;
+ struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
+ this_cpu_inc(scp->srcu_locks.counter);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- return idx;
+ return __srcu_ptr_to_ctr(ssp, scp);
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -728,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
+ this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -741,13 +778,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
*/
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
- int idx;
- struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+ struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp);
+ struct srcu_ctr *scp = raw_cpu_ptr(scpp);
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- atomic_long_inc(&sdp->srcu_lock_count[idx]);
+ atomic_long_inc(&scp->srcu_locks);
smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
- return idx;
+ return __srcu_ptr_to_ctr(ssp, scpp);
}
EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
@@ -758,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
*/
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
- struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
-
smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
- atomic_long_inc(&sdp->srcu_unlock_count[idx]);
+ atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
@@ -845,7 +879,6 @@ static void srcu_gp_end(struct srcu_struct *ssp)
bool cbs;
bool last_lvl;
int cpu;
- unsigned long flags;
unsigned long gpseq;
int idx;
unsigned long mask;
@@ -862,7 +895,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
spin_lock_irq_rcu_node(sup);
idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ if (srcu_gp_is_expedited(ssp))
cbdelay = 0;
WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
@@ -907,12 +940,12 @@ static void srcu_gp_end(struct srcu_struct *ssp)
if (!(gpseq & counter_wrap_check))
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irqsave_rcu_node(sdp, flags);
+ spin_lock_irq_rcu_node(sdp);
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ spin_unlock_irq_rcu_node(sdp);
}
/* Callback initiation done, allow grace periods after next. */
@@ -1046,7 +1079,6 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
/* If grace period not already in progress, start it. */
if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
- WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
srcu_gp_start(ssp);
// And how can that list_add() in the "else" clause
@@ -1066,13 +1098,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
/*
* Wait until all readers counted by array index idx complete, but
* loop an additional time if there is an expedited grace period pending.
- * The caller must ensure that ->srcu_idx is not changed while checking.
+ * The caller must ensure that ->srcu_ctrp is not changed while checking.
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
unsigned long curdelay;
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = !srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
@@ -1084,30 +1118,30 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
}
/*
- * Increment the ->srcu_idx counter so that future SRCU readers will
+ * Increment the ->srcu_ctrp counter so that future SRCU readers will
* use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
* us to wait for pre-existing readers in a starvation-free manner.
*/
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Because the flip of ->srcu_idx is executed only if the
+ * Because the flip of ->srcu_ctrp is executed only if the
* preceding call to srcu_readers_active_idx_check() found that
- * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
- * and because that summing uses atomic_long_read(), there is
- * ordering due to a control dependency between that summing and
- * the WRITE_ONCE() in this call to srcu_flip(). This ordering
- * ensures that if this updater saw a given reader's increment from
- * __srcu_read_lock(), that reader was using a value of ->srcu_idx
- * from before the previous call to srcu_flip(), which should be
- * quite rare. This ordering thus helps forward progress because
- * the grace period could otherwise be delayed by additional
- * calls to __srcu_read_lock() using that old (soon to be new)
- * value of ->srcu_idx.
+ * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums
+ * matched and because that summing uses atomic_long_read(),
+ * there is ordering due to a control dependency between that
+ * summing and the WRITE_ONCE() in this call to srcu_flip().
+ * This ordering ensures that if this updater saw a given reader's
+ * increment from __srcu_read_lock(), that reader was using a value
+ * of ->srcu_ctrp from before the previous call to srcu_flip(),
+ * which should be quite rare. This ordering thus helps forward
+ * progress because the grace period could otherwise be delayed
+ * by additional calls to __srcu_read_lock() using that old (soon
+ * to be new) value of ->srcu_ctrp.
*
* This sum-equality check and ordering also ensures that if
* a given call to __srcu_read_lock() uses the new value of
- * ->srcu_idx, this updater's earlier scans cannot have seen
+ * ->srcu_ctrp, this updater's earlier scans cannot have seen
* that reader's increments, which is all to the good, because
* this grace period need not wait on that reader. After all,
* if those earlier scans had seen that reader, there would have
@@ -1117,10 +1151,13 @@ static void srcu_flip(struct srcu_struct *ssp)
* it stays until either (1) Compilers learn about this sort of
* control dependency or (2) Some production workload running on
* a production system is unduly delayed by this slowpath smp_mb().
+ * Except for _lite() readers, where it is inoperative, which
+ * means that it is a good thing that it is redundant.
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
+ WRITE_ONCE(ssp->srcu_ctrp,
+ &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]);
/*
* Ensure that if the updater misses an __srcu_read_unlock()
@@ -1134,7 +1171,9 @@ static void srcu_flip(struct srcu_struct *ssp)
}
/*
- * If SRCU is likely idle, return true, otherwise return false.
+ * If SRCU is likely idle, in other words, the next SRCU grace period
+ * should be expedited, return true, otherwise return false. Except that
+ * in the presence of _lite() readers, always return false.
*
* Note that it is OK for several current from-idle requests for a new
* grace period from idle to specify expediting because they will all end
@@ -1154,7 +1193,7 @@ static void srcu_flip(struct srcu_struct *ssp)
* negligible when amortized over that time period, and the extra latency
* of a needlessly non-expedited grace period is similarly negligible.
*/
-static bool srcu_might_be_idle(struct srcu_struct *ssp)
+static bool srcu_should_expedite(struct srcu_struct *ssp)
{
unsigned long curseq;
unsigned long flags;
@@ -1163,6 +1202,9 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
unsigned long tlast;
check_init_srcu_struct(ssp);
+ /* If _lite() readers, don't do unsolicited expediting. */
+ if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP)
+ return false;
/* If the local srcu_data structure has callbacks, not idle. */
sdp = raw_cpu_ptr(ssp->sda);
spin_lock_irqsave_rcu_node(sdp, flags);
@@ -1361,8 +1403,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
* read-side critical sections are delimited by srcu_read_lock() and
* srcu_read_unlock(), and may be nested.
*
- * The callback will be invoked from process context, but must nevertheless
- * be fast and must not block.
+ * The callback will be invoked from process context, but with bh
+ * disabled. The callback function must therefore be fast and must
+ * not block.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func)
@@ -1428,8 +1474,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
*
* Wait for the count to drain to zero of both indexes. To avoid the
* possible starvation of synchronize_srcu(), it waits for the count of
- * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first,
- * and then flip the srcu_idx and wait for the count of the other index.
+ * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero
+ * at first, and then flip the ->srcu_ctrp and wait for the count of the
+ * other index.
*
* Can block; must be called from process context.
*
@@ -1464,14 +1511,15 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
* Implementation of these memory-ordering guarantees is similar to
* that of synchronize_rcu().
*
- * If SRCU is likely idle, expedite the first request. This semantic
- * was provided by Classic SRCU, and is relied upon by its users, so TREE
- * SRCU must also provide it. Note that detecting idleness is heuristic
- * and subject to both false positives and negatives.
+ * If SRCU is likely idle as determined by srcu_should_expedite(),
+ * expedite the first request. This semantic was provided by Classic SRCU,
+ * and is relied upon by its users, so TREE SRCU must also provide it.
+ * Note that detecting idleness is heuristic and subject to both false
+ * positives and negatives.
*/
void synchronize_srcu(struct srcu_struct *ssp)
{
- if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited())
+ if (srcu_should_expedite(ssp) || rcu_gp_is_expedited())
synchronize_srcu_expedited(ssp);
else
__synchronize_srcu(ssp, true);
@@ -1540,7 +1588,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
+ if (cookie != SRCU_GET_STATE_COMPLETED &&
+ !rcu_seq_done_exact(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
@@ -1557,6 +1606,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
struct srcu_data *sdp;
struct srcu_struct *ssp;
+ rhp->next = rhp; // Mark the callback as having been invoked.
sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
ssp = sdp->ssp;
if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
@@ -1635,7 +1685,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
*/
unsigned long srcu_batches_completed(struct srcu_struct *ssp)
{
- return READ_ONCE(ssp->srcu_idx);
+ return READ_ONCE(ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
@@ -1652,7 +1702,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
/*
* Because readers might be delayed for an extended period after
- * fetching ->srcu_idx for their index, at any point in time there
+ * fetching ->srcu_ctrp for their index, at any point in time there
* might well be readers using both idx=0 and idx=1. We therefore
* need to wait for readers to clear from both index values before
* invoking a callback.
@@ -1680,7 +1730,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
}
if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
- idx = 1 ^ (ssp->srcu_idx & 1);
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
if (!try_check_zero(ssp, idx, 1)) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
@@ -1698,7 +1748,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* SRCU read-side critical sections are normally short,
* so check at least twice in quick succession after a flip.
*/
- idx = 1 ^ (ssp->srcu_idx & 1);
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
if (!try_check_zero(ssp, idx, 2)) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
@@ -1809,12 +1859,15 @@ static void process_srcu(struct work_struct *work)
ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (curdelay) {
WRITE_ONCE(sup->reschedule_count, 0);
} else {
j = jiffies;
if (READ_ONCE(sup->reschedule_jiffies) == j) {
+ ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count);
WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
curdelay = 1;
@@ -1826,12 +1879,9 @@ static void process_srcu(struct work_struct *work)
srcu_reschedule(ssp, curdelay);
}
-void srcutorture_get_gp_data(enum rcutorture_type test_type,
- struct srcu_struct *ssp, int *flags,
+void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags,
unsigned long *gp_seq)
{
- if (test_type != SRCU_FLAVOR)
- return;
*flags = 0;
*gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
@@ -1858,7 +1908,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
int ss_state_idx = ss_state;
- idx = ssp->srcu_idx & 0x1;
+ idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0];
if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
@@ -1876,8 +1926,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
struct srcu_data *sdp;
sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
- u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
+ u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks));
+ u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks));
/*
* Make sure that a lock is always counted if the corresponding
@@ -1885,8 +1935,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
*/
smp_rmb();
- l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
- l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
+ l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks));
+ l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks));
c0 = l0 - u0;
c1 = l1 - u1;
@@ -1963,6 +2013,7 @@ static int srcu_module_coming(struct module *mod)
ssp->sda = alloc_percpu(struct srcu_data);
if (WARN_ON_ONCE(!ssp->sda))
return -ENOMEM;
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
}
return 0;
}
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 86df878a2fee..da60a9947c00 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -152,9 +152,9 @@ void rcu_sync_enter(struct rcu_sync *rsp)
void rcu_sync_exit(struct rcu_sync *rsp)
{
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
- WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
spin_lock_irq(&rsp->rss_lock);
+ WARN_ON_ONCE(rsp->gp_count == 0);
if (!--rsp->gp_count) {
if (rsp->gp_state == GP_PASSED) {
WRITE_ONCE(rsp->gp_state, GP_EXIT);
@@ -174,10 +174,10 @@ void rcu_sync_dtor(struct rcu_sync *rsp)
{
int gp_state;
- WARN_ON_ONCE(READ_ONCE(rsp->gp_count));
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
spin_lock_irq(&rsp->rss_lock);
+ WARN_ON_ONCE(rsp->gp_count);
if (rsp->gp_state == GP_REPLAY)
WRITE_ONCE(rsp->gp_state, GP_EXIT);
gp_state = rsp->gp_state;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 147b5945d67a..c0cc7ae41106 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -34,6 +34,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
* @rtp_blkd_tasks: List of tasks blocked as readers.
* @rtp_exit_list: List of tasks in the latter portion of do_exit().
* @cpu: CPU number corresponding to this entry.
+ * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure.
* @rtpp: Pointer to the rcu_tasks structure.
*/
struct rcu_tasks_percpu {
@@ -49,6 +50,7 @@ struct rcu_tasks_percpu {
struct list_head rtp_blkd_tasks;
struct list_head rtp_exit_list;
int cpu;
+ int index;
struct rcu_tasks *rtpp;
};
@@ -63,7 +65,7 @@ struct rcu_tasks_percpu {
* @init_fract: Initial backoff sleep interval.
* @gp_jiffies: Time of last @gp_state transition.
* @gp_start: Most recent grace-period start in jiffies.
- * @tasks_gp_seq: Number of grace periods completed since boot.
+ * @tasks_gp_seq: Number of grace periods completed since boot in upper bits.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
@@ -74,7 +76,9 @@ struct rcu_tasks_percpu {
* @holdouts_func: This flavor's holdout-list scan function (optional).
* @postgp_func: This flavor's post-grace-period function (optional).
* @call_func: This flavor's call_rcu()-equivalent function.
+ * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE).
* @rtpcpu: This flavor's rcu_tasks_percpu structure.
+ * @rtpcp_array: Array of pointers to rcu_tasks_percpu structure of CPUs in cpu_possible_mask.
* @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
* @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
* @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
@@ -83,6 +87,7 @@ struct rcu_tasks_percpu {
* @barrier_q_count: Number of queues being waited on.
* @barrier_q_completion: Barrier wait/wakeup mechanism.
* @barrier_q_seq: Sequence number for barrier operations.
+ * @barrier_q_start: Most recent barrier start in jiffies.
* @name: This flavor's textual name.
* @kname: This flavor's kthread name.
*/
@@ -107,7 +112,9 @@ struct rcu_tasks {
holdouts_func_t holdouts_func;
postgp_func_t postgp_func;
call_rcu_func_t call_func;
+ unsigned int wait_state;
struct rcu_tasks_percpu __percpu *rtpcpu;
+ struct rcu_tasks_percpu **rtpcp_array;
int percpu_enqueue_shift;
int percpu_enqueue_lim;
int percpu_dequeue_lim;
@@ -116,6 +123,7 @@ struct rcu_tasks {
atomic_t barrier_q_count;
struct completion barrier_q_completion;
unsigned long barrier_q_seq;
+ unsigned long barrier_q_start;
char *name;
char *kname;
};
@@ -134,6 +142,7 @@ static struct rcu_tasks rt_name = \
.tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \
.gp_func = gp, \
.call_func = call, \
+ .wait_state = TASK_UNINTERRUPTIBLE, \
.rtpcpu = &rt_name ## __percpu, \
.lazy_jiffies = DIV_ROUND_UP(HZ, 4), \
.name = n, \
@@ -147,7 +156,7 @@ static struct rcu_tasks rt_name = \
#ifdef CONFIG_TASKS_RCU
-/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+/* Report delay of scan exiting tasklist in rcu_tasks_postscan(). */
static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
#endif
@@ -179,6 +188,8 @@ module_param(rcu_task_collapse_lim, int, 0444);
static int rcu_task_lazy_lim __read_mostly = 32;
module_param(rcu_task_lazy_lim, int, 0444);
+static int rcu_task_cpu_ids;
+
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
#define RTGS_WAIT_WAIT_CBS 1
@@ -242,6 +253,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
int cpu;
int lim;
int shift;
+ int maxcpu;
+ int index = 0;
if (rcu_task_enqueue_lim < 0) {
rcu_task_enqueue_lim = 1;
@@ -251,14 +264,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
}
lim = rcu_task_enqueue_lim;
- if (lim > nr_cpu_ids)
- lim = nr_cpu_ids;
- shift = ilog2(nr_cpu_ids / lim);
- if (((nr_cpu_ids - 1) >> shift) >= lim)
- shift++;
- WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
- WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
- smp_store_release(&rtp->percpu_enqueue_lim, lim);
+ rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL);
+ BUG_ON(!rtp->rtpcp_array);
+
for_each_possible_cpu(cpu) {
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
@@ -270,14 +278,30 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
rtpcp->cpu = cpu;
rtpcp->rtpp = rtp;
+ rtpcp->index = index;
+ rtp->rtpcp_array[index] = rtpcp;
+ index++;
if (!rtpcp->rtp_blkd_tasks.next)
INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
if (!rtpcp->rtp_exit_list.next)
INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+ rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head;
+ maxcpu = cpu;
}
- pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
- data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust);
+ rcu_task_cpu_ids = maxcpu + 1;
+ if (lim > rcu_task_cpu_ids)
+ lim = rcu_task_cpu_ids;
+ shift = ilog2(rcu_task_cpu_ids / lim);
+ if (((rcu_task_cpu_ids - 1) >> shift) >= lim)
+ shift++;
+ WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
+ smp_store_release(&rtp->percpu_enqueue_lim, lim);
+
+ pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n",
+ rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim),
+ rcu_task_cb_adjust, rcu_task_cpu_ids);
}
// Compute wakeup time for lazy callback timer.
@@ -336,6 +360,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
rcu_read_lock();
ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask);
+ WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids);
rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu);
if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
@@ -345,7 +370,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
rtpcp->rtp_n_lock_retries = 0;
}
if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim &&
- READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
+ READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids)
needadjust = true; // Defer adjustment to avoid deadlock.
}
// Queuing callbacks before initialization not yet supported.
@@ -365,10 +390,10 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
if (unlikely(needadjust)) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
- if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
+ if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) {
WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
- WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
- smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids);
+ smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids);
pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
}
raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
@@ -385,6 +410,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
struct rcu_tasks *rtp;
struct rcu_tasks_percpu *rtpcp;
+ rhp->next = rhp; // Mark the callback as having been invoked.
rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head);
rtp = rtpcp->rtpp;
if (atomic_dec_and_test(&rtp->barrier_q_count))
@@ -393,7 +419,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
// Wait for all in-flight callbacks for the specified RCU Tasks flavor.
// Operates in a manner similar to rcu_barrier().
-static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
+static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
{
int cpu;
unsigned long flags;
@@ -406,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
mutex_unlock(&rtp->barrier_q_mutex);
return;
}
+ rtp->barrier_q_start = jiffies;
rcu_seq_start(&rtp->barrier_q_seq);
init_completion(&rtp->barrier_q_completion);
atomic_set(&rtp->barrier_q_count, 2);
@@ -441,6 +468,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
for (cpu = 0; cpu < dequeue_limit; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
/* Advance and accelerate any new callbacks. */
@@ -478,7 +507,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rtp->percpu_enqueue_lim > 1) {
- WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
+ WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids));
smp_store_release(&rtp->percpu_enqueue_lim, 1);
rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
gpdone = false;
@@ -493,7 +522,9 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
}
if (rtp->percpu_dequeue_lim == 1) {
- for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+ for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) {
+ if (!cpu_possible(cpu))
+ continue;
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
@@ -508,30 +539,32 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
// Advance callbacks and invoke any that are ready.
static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp)
{
- int cpu;
- int cpunext;
int cpuwq;
unsigned long flags;
int len;
+ int index;
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
struct rcu_tasks_percpu *rtpcp_next;
- cpu = rtpcp->cpu;
- cpunext = cpu * 2 + 1;
- if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
- rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
- cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
- queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
- cpunext++;
- if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
- rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
- cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ index = rtpcp->index * 2 + 1;
+ if (index < num_possible_cpus()) {
+ rtpcp_next = rtp->rtpcp_array[index];
+ if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ index++;
+ if (index < num_possible_cpus()) {
+ rtpcp_next = rtp->rtpcp_array[index];
+ if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ }
+ }
}
}
- if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu))
+ if (rcu_segcblist_empty(&rtpcp->cblist))
return;
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
@@ -638,7 +671,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
// If the grace-period kthread is running, use it.
if (READ_ONCE(rtp->kthread_ptr)) {
- wait_rcu_gp(rtp->call_func);
+ wait_rcu_gp_state(rtp->wait_state, rtp->call_func);
return;
}
rcu_tasks_one_gp(rtp, true);
@@ -684,9 +717,7 @@ static void __init rcu_tasks_bootup_oddness(void)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-#endif /* #ifndef CONFIG_TINY_RCU */
-#ifndef CONFIG_TINY_RCU
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
@@ -720,6 +751,53 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
rtp->lazy_jiffies,
s);
}
+
+/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. */
+static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt,
+ char *tf, char *tst)
+{
+ cpumask_var_t cm;
+ int cpu;
+ bool gotcb = false;
+ unsigned long j = jiffies;
+
+ pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n",
+ tt, tf, tst, data_race(rtp->tasks_gp_seq),
+ j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies),
+ data_race(rtp->gp_state), tasks_gp_state_getname(rtp));
+ pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n",
+ data_race(rtp->percpu_enqueue_shift),
+ data_race(rtp->percpu_enqueue_lim),
+ data_race(rtp->percpu_dequeue_lim),
+ data_race(rtp->percpu_dequeue_gpseq));
+ (void)zalloc_cpumask_var(&cm, GFP_KERNEL);
+ pr_alert("\tCallback counts:");
+ for_each_possible_cpu(cpu) {
+ long n;
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head))
+ cpumask_set_cpu(cpu, cm);
+ n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+ if (!n)
+ continue;
+ pr_cont(" %d:%ld", cpu, n);
+ gotcb = true;
+ }
+ if (gotcb)
+ pr_cont(".\n");
+ else
+ pr_cont(" (none).\n");
+ pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ",
+ data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start),
+ atomic_read(&rtp->barrier_q_count));
+ if (cpumask_available(cm) && !cpumask_empty(cm))
+ pr_cont(" %*pbl.\n", cpumask_pr_args(cm));
+ else
+ pr_cont("(none).\n");
+ free_cpumask_var(cm);
+}
+
#endif // #ifndef CONFIG_TINY_RCU
static void exit_tasks_rcu_finish_trace(struct task_struct *t);
@@ -855,7 +933,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// not know to synchronize with this RCU Tasks grace period) have
// completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
// will take care of any tasks stuck in the non-preemptible region
-// of do_exit() following its call to exit_tasks_rcu_stop().
+// of do_exit() following its call to exit_tasks_rcu_finish().
// check_all_holdout_tasks(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
@@ -908,6 +986,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t)
return false;
/*
+ * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but
+ * since it is a spurious state (it will transition into the
+ * traditional blocked state or get woken up without outside
+ * dependencies), not considering it such should only affect timing.
+ *
+ * Be conservative for now and not include it.
+ */
+
+ /*
* Idle tasks (or idle injection) within the idle loop are RCU-tasks
* quiescent states. But CPU boot code performed by the idle task
* isn't a quiescent state.
@@ -999,7 +1086,7 @@ static void rcu_tasks_postscan(struct list_head *hop)
}
if (!IS_ENABLED(CONFIG_TINY_RCU))
- del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
+ timer_delete_sync(&tasks_rcu_exit_srcu_stall_timer);
}
/* See if tasks are still holding out, complain if so. */
@@ -1160,6 +1247,7 @@ static int __init rcu_spawn_tasks_kthread(void)
rcu_tasks.postscan_func = rcu_tasks_postscan;
rcu_tasks.holdouts_func = check_all_holdout_tasks;
rcu_tasks.postgp_func = rcu_tasks_postgp;
+ rcu_tasks.wait_state = TASK_IDLE;
rcu_spawn_tasks_kthread_generic(&rcu_tasks);
return 0;
}
@@ -1170,6 +1258,12 @@ void show_rcu_tasks_classic_gp_kthread(void)
show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
+
+void rcu_tasks_torture_stats_print(char *tt, char *tf)
+{
+ rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print);
#endif // !defined(CONFIG_TINY_RCU)
struct task_struct *get_rcu_tasks_gp_kthread(void)
@@ -1178,6 +1272,13 @@ struct task_struct *get_rcu_tasks_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
+void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_get_gp_data);
+
/*
* Protect against tasklist scan blind spot while the task is exiting and
* may be removed from the tasklist. Do this by adding the task to yet
@@ -1199,8 +1300,7 @@ void exit_tasks_rcu_start(void)
rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
t->rcu_tasks_exit_cpu = smp_processor_id();
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
- if (!rtpcp->rtp_exit_list.next)
- INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+ WARN_ON_ONCE(!rtpcp->rtp_exit_list.next);
list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
preempt_enable();
@@ -1210,7 +1310,7 @@ void exit_tasks_rcu_start(void)
* Remove the task from the "yet another list" because do_exit() is now
* non-preemptible, allowing synchronize_rcu() to wait beyond this point.
*/
-void exit_tasks_rcu_stop(void)
+void exit_tasks_rcu_finish(void)
{
unsigned long flags;
struct rcu_tasks_percpu *rtpcp;
@@ -1221,22 +1321,12 @@ void exit_tasks_rcu_stop(void)
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
list_del_init(&t->rcu_tasks_exit_list);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
-}
-/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
- */
-void exit_tasks_rcu_finish(void)
-{
- exit_tasks_rcu_stop();
- exit_tasks_rcu_finish_trace(current);
+ exit_tasks_rcu_finish_trace(t);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
-void exit_tasks_rcu_stop(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -1244,13 +1334,12 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
////////////////////////////////////////////////////////////////////////
//
-// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of
-// passing an empty function to schedule_on_each_cpu(). This approach
-// provides an asynchronous call_rcu_tasks_rude() API and batching of
-// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
-// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
-// and induces otherwise unnecessary context switches on all online CPUs,
-// whether idle or not.
+// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's
+// trick of passing an empty function to schedule_on_each_cpu().
+// This approach provides batching of concurrent calls to the synchronous
+// synchronize_rcu_tasks_rude() API. This invokes schedule_on_each_cpu()
+// in order to send IPIs far and wide and induces otherwise unnecessary
+// context switches on all online CPUs, whether idle or not.
//
// Callback handling is provided by the rcu_tasks_kthread() function.
//
@@ -1268,11 +1357,11 @@ static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
schedule_on_each_cpu(rcu_tasks_be_rude);
}
-void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func);
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
"RCU Tasks Rude");
-/**
+/*
* call_rcu_tasks_rude() - Queue a callback rude task-based grace period
* @rhp: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
@@ -1289,12 +1378,14 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
+ *
+ * This is no longer exported, and is instead reserved for use by
+ * synchronize_rcu_tasks_rude().
*/
-void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
{
call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude);
}
-EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);
/**
* synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period
@@ -1316,30 +1407,14 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);
*/
void synchronize_rcu_tasks_rude(void)
{
- synchronize_rcu_tasks_generic(&rcu_tasks_rude);
+ if (!IS_ENABLED(CONFIG_ARCH_WANTS_NO_INSTR) || IS_ENABLED(CONFIG_FORCE_TASKS_RUDE_RCU))
+ synchronize_rcu_tasks_generic(&rcu_tasks_rude);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
-/**
- * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks.
- *
- * Although the current implementation is guaranteed to wait, it is not
- * obligated to, for example, if there are no pending callbacks.
- */
-void rcu_barrier_tasks_rude(void)
-{
- rcu_barrier_tasks_generic(&rcu_tasks_rude);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
-
-int rcu_tasks_rude_lazy_ms = -1;
-module_param(rcu_tasks_rude_lazy_ms, int, 0444);
-
static int __init rcu_spawn_tasks_rude_kthread(void)
{
rcu_tasks_rude.gp_sleep = HZ / 10;
- if (rcu_tasks_rude_lazy_ms >= 0)
- rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
}
@@ -1350,6 +1425,12 @@ void show_rcu_tasks_rude_gp_kthread(void)
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
+
+void rcu_tasks_rude_torture_stats_print(char *tt, char *tf)
+{
+ rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print);
#endif // !defined(CONFIG_TINY_RCU)
struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
@@ -1358,6 +1439,13 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread);
+void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks_rude.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data);
+
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
////////////////////////////////////////////////////////////////////////
@@ -1457,19 +1545,12 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v)
/*
* Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
* the four-byte operand-size restriction of some platforms.
+ *
* Returns the old value, which is often ignored.
*/
u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
{
- union rcu_special ret;
- union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
- union rcu_special trs_new = trs_old;
-
- if (trs_old.b.need_qs != old)
- return trs_old.b.need_qs;
- trs_new.b.need_qs = new;
- ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
- return ret.b.need_qs;
+ return cmpxchg(&t->trc_reader_special.b.need_qs, old, new);
}
EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
@@ -1598,7 +1679,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
// However, we cannot safely change its state.
n_heavy_reader_attempts++;
// Check for "running" idle tasks on offline CPUs.
- if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
+ if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting))
return -EINVAL; // No quiescent state, do it the hard way.
n_heavy_reader_updates++;
nesting = 0;
@@ -1732,6 +1813,16 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
// allow safe access to the hop list.
for_each_online_cpu(cpu) {
rcu_read_lock();
+ // Note that cpu_curr_snapshot() picks up the target
+ // CPU's current task while its runqueue is locked with
+ // an smp_mb__after_spinlock(). This ensures that either
+ // the grace-period kthread will see that task's read-side
+ // critical section or the task will see the updater's pre-GP
+ // accesses. The trailing smp_mb() in cpu_curr_snapshot()
+ // does not currently play a role other than simplify
+ // that function's ordering semantics. If these simplified
+ // ordering semantics continue to be redundant, that smp_mb()
+ // might be removed.
t = cpu_curr_snapshot(cpu);
if (rcu_tasks_trace_pertask_prep(t, true))
trc_add_holdout(t, hop);
@@ -1994,7 +2085,7 @@ void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
- sprintf(buf, "N%lu h:%lu/%lu/%lu",
+ snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu",
data_race(n_trc_holdouts),
data_race(n_heavy_reader_ofl_updates),
data_race(n_heavy_reader_updates),
@@ -2002,6 +2093,12 @@ void show_rcu_tasks_trace_gp_kthread(void)
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
+
+void rcu_tasks_trace_torture_stats_print(char *tt, char *tf)
+{
+ rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, "");
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print);
#endif // !defined(CONFIG_TINY_RCU)
struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
@@ -2010,6 +2107,13 @@ struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
+void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq)
+{
+ *flags = 0;
+ *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data);
+
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
@@ -2038,17 +2142,13 @@ static struct rcu_tasks_test_desc tests[] = {
.notrun = IS_ENABLED(CONFIG_TASKS_RCU),
},
{
- .name = "call_rcu_tasks_rude()",
- /* If not defined, the test is skipped. */
- .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
- },
- {
.name = "call_rcu_tasks_trace()",
/* If not defined, the test is skipped. */
.notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
}
};
+#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
static void test_rcu_tasks_callback(struct rcu_head *rhp)
{
struct rcu_tasks_test_desc *rttd =
@@ -2058,6 +2158,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
rttd->notrun = false;
}
+#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
static void rcu_tasks_initiate_self_tests(void)
{
@@ -2070,16 +2171,14 @@ static void rcu_tasks_initiate_self_tests(void)
#ifdef CONFIG_TASKS_RUDE_RCU
pr_info("Running RCU Tasks Rude wait API self tests\n");
- tests[1].runstart = jiffies;
synchronize_rcu_tasks_rude();
- call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
pr_info("Running RCU Tasks Trace wait API self tests\n");
- tests[2].runstart = jiffies;
+ tests[1].runstart = jiffies;
synchronize_rcu_tasks_trace();
- call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
+ call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback);
#endif
}
@@ -2157,7 +2256,7 @@ void __init tasks_cblist_init_generic(void)
#endif
}
-void __init rcu_init_tasks_generic(void)
+static int __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
rcu_spawn_tasks_kthread();
@@ -2173,7 +2272,10 @@ void __init rcu_init_tasks_generic(void)
// Run the self-tests.
rcu_tasks_initiate_self_tests();
+
+ return 0;
}
+core_initcall(rcu_init_tasks_generic);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 705c0d16850a..c1ebfd51768b 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user)
static inline bool rcu_reclaim_tiny(struct rcu_head *head)
{
rcu_callback_t f;
- unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
- if (__is_kvfree_rcu_offset(offset)) {
- trace_rcu_invoke_kvfree_callback("", head, offset);
- kvfree((void *)head - offset);
- rcu_lock_release(&rcu_callback_map);
- return true;
- }
trace_rcu_invoke_callback("", head);
f = head->func;
@@ -105,7 +98,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
}
/* Invoke the RCU callbacks whose grace period has elapsed. */
-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
+static __latent_entropy void rcu_process_callbacks(void)
{
struct rcu_head *next, *list;
unsigned long flags;
@@ -130,9 +123,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
- local_bh_disable();
rcu_reclaim_tiny(list);
- local_bh_enable();
list = next;
}
}
@@ -155,14 +146,12 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
+ preempt_disable();
WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
-static void tiny_rcu_leak_callback(struct rcu_head *rhp)
-{
-}
-
/*
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
@@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
mem_dump_obj(head);
}
-
- if (!__is_kvfree_rcu_offset((unsigned long)head->func))
- WRITE_ONCE(head->func, tiny_rcu_leak_callback);
return;
}
@@ -246,15 +232,18 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
-#ifdef CONFIG_KASAN_GENERIC
-void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST)
+unsigned long long rcutorture_gather_gp_seqs(void)
{
- if (head)
- kasan_record_aux_stack_noalloc(ptr);
+ return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL;
+}
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
- __kvfree_call_rcu(head, ptr);
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
+{
+ snprintf(cp, len, "g%04llx", seqs & 0xffffULL);
}
-EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
#endif
void __init rcu_init(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d9642dd06c25..e8a4b720d7d2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -75,13 +75,20 @@
#define MODULE_PARAM_PREFIX "rcutree."
/* Data structures. */
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.gpwrap = true,
-#ifdef CONFIG_RCU_NOCB_CPU
- .cblist.flags = SEGCBLIST_RCU_CORE,
-#endif
};
+
+int rcu_get_gpwrap_count(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return READ_ONCE(rdp->gpwrap_count);
+}
+EXPORT_SYMBOL_GPL(rcu_get_gpwrap_count);
+
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
.gp_state = RCU_GP_IDLE,
@@ -93,6 +100,12 @@ static struct rcu_state rcu_state = {
.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
+ .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
+ rcu_sr_normal_gp_cleanup_work),
+ .srs_cleanups_pending = ATOMIC_INIT(0),
+#ifdef CONFIG_RCU_NOCB_CPU
+ .nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex),
+#endif
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -145,7 +158,6 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
@@ -172,6 +184,9 @@ static int gp_init_delay;
module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
+static int nohz_full_patience_delay;
+module_param(nohz_full_patience_delay, int, 0444);
+static int nohz_full_patience_delay_jiffies;
// Add delay to rcu_read_unlock() for strict grace periods.
static int rcu_unlock_delay;
@@ -179,26 +194,6 @@ static int rcu_unlock_delay;
module_param(rcu_unlock_delay, int, 0444);
#endif
-/*
- * This rcu parameter is runtime-read-only. It reflects
- * a minimum allowed number of objects which can be cached
- * per-CPU. Object size is equal to one page. This value
- * can be changed at boot time.
- */
-static int rcu_min_cached_objs = 5;
-module_param(rcu_min_cached_objs, int, 0444);
-
-// A page shrinker can ask for pages to be freed to make them
-// available for other parts of the system. This usually happens
-// under low memory conditions, and in that case we should also
-// defer page-cache filling for a short time period.
-//
-// The default value is 5 seconds, which is long enough to reduce
-// interference with the shrinker while it asks other systems to
-// drain their caches.
-static int rcu_delay_page_cache_fill_msec = 5000;
-module_param(rcu_delay_page_cache_fill_msec, int, 0444);
-
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -240,76 +235,113 @@ static long rcu_get_n_cbs_cpu(int cpu)
return 0;
}
+/**
+ * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing
+ *
+ * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU.
+ * This is a special-purpose function to be used in the softirq
+ * infrastructure and perhaps the occasional long-running softirq
+ * handler.
+ *
+ * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is
+ * equivalent to momentarily completely enabling preemption. For
+ * example, given this code::
+ *
+ * local_bh_disable();
+ * do_something();
+ * rcu_softirq_qs(); // A
+ * do_something_else();
+ * local_bh_enable(); // B
+ *
+ * A call to synchronize_rcu() that began concurrently with the
+ * call to do_something() would be guaranteed to wait only until
+ * execution reached statement A. Without that rcu_softirq_qs(),
+ * that same synchronize_rcu() would instead be guaranteed to wait
+ * until execution reached statement B.
+ */
void rcu_softirq_qs(void)
{
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal rcu_softirq_qs() in RCU read-side critical section");
rcu_qs();
rcu_preempt_deferred_qs(current);
rcu_tasks_qs(current, false);
}
/*
- * Reset the current CPU's ->dynticks counter to indicate that the
+ * Reset the current CPU's RCU_WATCHING counter to indicate that the
* newly onlined CPU is no longer in an extended quiescent state.
* This will either leave the counter unchanged, or increment it
* to the next non-quiescent value.
*
* The non-atomic test/increment sequence works because the upper bits
- * of the ->dynticks counter are manipulated only by the corresponding CPU,
+ * of the ->state variable are manipulated only by the corresponding CPU,
* or when the corresponding CPU is offline.
*/
-static void rcu_dynticks_eqs_online(void)
+static void rcu_watching_online(void)
{
- if (ct_dynticks() & RCU_DYNTICKS_IDX)
+ if (ct_rcu_watching() & CT_RCU_WATCHING)
return;
- ct_state_inc(RCU_DYNTICKS_IDX);
-}
-
-/*
- * Snapshot the ->dynticks counter with full ordering so as to allow
- * stable comparison of this counter with past and future snapshots.
- */
-static int rcu_dynticks_snap(int cpu)
-{
- smp_mb(); // Fundamental RCU ordering guarantee.
- return ct_dynticks_cpu_acquire(cpu);
+ ct_state_inc(CT_RCU_WATCHING);
}
/*
- * Return true if the snapshot returned from rcu_dynticks_snap()
+ * Return true if the snapshot returned from ct_rcu_watching()
* indicates that RCU is in an extended quiescent state.
*/
-static bool rcu_dynticks_in_eqs(int snap)
+static bool rcu_watching_snap_in_eqs(int snap)
{
- return !(snap & RCU_DYNTICKS_IDX);
+ return !(snap & CT_RCU_WATCHING);
}
-/*
- * Return true if the CPU corresponding to the specified rcu_data
- * structure has spent some time in an extended quiescent state since
- * rcu_dynticks_snap() returned the specified snapshot.
+/**
+ * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU
+ * since the specified @snap?
+ *
+ * @rdp: The rcu_data corresponding to the CPU for which to check EQS.
+ * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS.
+ *
+ * Returns true if the CPU corresponding to @rdp has spent some time in an
+ * extended quiescent state since @snap. Note that this doesn't check if it
+ * /still/ is in an EQS, just that it went through one since @snap.
+ *
+ * This is meant to be used in a loop waiting for a CPU to go through an EQS.
*/
-static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
+static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap)
{
- return snap != rcu_dynticks_snap(rdp->cpu);
+ /*
+ * The first failing snapshot is already ordered against the accesses
+ * performed by the remote CPU after it exits idle.
+ *
+ * The second snapshot therefore only needs to order against accesses
+ * performed by the remote CPU prior to entering idle and therefore can
+ * rely solely on acquire semantics.
+ */
+ if (WARN_ON_ONCE(rcu_watching_snap_in_eqs(snap)))
+ return true;
+
+ return snap != ct_rcu_watching_cpu_acquire(rdp->cpu);
}
/*
* Return true if the referenced integer is zero while the specified
* CPU remains within a single extended quiescent state.
*/
-bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
+bool rcu_watching_zero_in_eqs(int cpu, int *vp)
{
int snap;
// If not quiescent, force back to earlier extended quiescent state.
- snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
- smp_rmb(); // Order ->dynticks and *vp reads.
+ snap = ct_rcu_watching_cpu(cpu) & ~CT_RCU_WATCHING;
+ smp_rmb(); // Order CT state and *vp reads.
if (READ_ONCE(*vp))
return false; // Non-zero, so report failure;
- smp_rmb(); // Order *vp read and ->dynticks re-read.
+ smp_rmb(); // Order *vp read and CT state re-read.
// If still in the same extended quiescent state, we are good!
- return snap == ct_dynticks_cpu(cpu);
+ return snap == ct_rcu_watching_cpu(cpu);
}
/*
@@ -323,17 +355,17 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
*
* The caller must have disabled interrupts and must not be idle.
*/
-notrace void rcu_momentary_dyntick_idle(void)
+notrace void rcu_momentary_eqs(void)
{
int seq;
raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- seq = ct_state_inc(2 * RCU_DYNTICKS_IDX);
+ seq = ct_state_inc(2 * CT_RCU_WATCHING);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
+ WARN_ON_ONCE(!(seq & CT_RCU_WATCHING));
rcu_preempt_deferred_qs(current);
}
-EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
+EXPORT_SYMBOL_GPL(rcu_momentary_eqs);
/**
* rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
@@ -355,13 +387,13 @@ static int rcu_is_cpu_rrupt_from_idle(void)
lockdep_assert_irqs_disabled();
/* Check for counter underflows */
- RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0,
- "RCU dynticks_nesting counter underflow!");
- RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0,
- "RCU dynticks_nmi_nesting counter underflow/zero!");
+ RCU_LOCKDEP_WARN(ct_nesting() < 0,
+ "RCU nesting counter underflow!");
+ RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0,
+ "RCU nmi_nesting counter underflow/zero!");
/* Are we at first interrupt nesting level? */
- nesting = ct_dynticks_nmi_nesting();
+ nesting = ct_nmi_nesting();
if (nesting > 1)
return false;
@@ -371,7 +403,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
WARN_ON_ONCE(!nesting && !is_idle_task(current));
/* Does CPU appear to be idle from an RCU standpoint? */
- return ct_dynticks_nesting() == 0;
+ return ct_nesting() == 0;
}
#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
@@ -508,20 +540,33 @@ static struct rcu_node *rcu_get_root(void)
/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
-void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
- unsigned long *gp_seq)
+void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
{
- switch (test_type) {
- case RCU_FLAVOR:
- *flags = READ_ONCE(rcu_state.gp_flags);
- *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
- break;
- default:
- break;
- }
+ *flags = READ_ONCE(rcu_state.gp_flags);
+ *gp_seq = rcu_seq_current(&rcu_state.gp_seq);
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
+/* Gather grace-period sequence numbers for rcutorture diagnostics. */
+unsigned long long rcutorture_gather_gp_seqs(void)
+{
+ return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) |
+ ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) |
+ (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL);
+}
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
+
+/* Format grace-period sequence numbers for rcutorture diagnostics. */
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
+{
+ unsigned int egp = (seqs >> 16) & 0xffffffULL;
+ unsigned int ggp = (seqs >> 40) & 0xffffULL;
+ unsigned int pgp = seqs & 0xffffULL;
+
+ snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp);
+}
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
+
#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
* An empty function that will trigger a reschedule on
@@ -570,12 +615,12 @@ void rcu_irq_exit_check_preempt(void)
{
lockdep_assert_irqs_disabled();
- RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
- "RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() !=
- DYNTICK_IRQ_NONIDLE,
- "Bad RCU dynticks_nmi_nesting counter\n");
- RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
+ RCU_LOCKDEP_WARN(ct_nesting() <= 0,
+ "RCU nesting counter underflow/zero!");
+ RCU_LOCKDEP_WARN(ct_nmi_nesting() !=
+ CT_NESTING_IRQ_NONIDLE,
+ "Bad RCU nmi_nesting counter\n");
+ RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(),
"RCU in extended quiescent state!");
}
#endif /* #ifdef CONFIG_PROVE_RCU */
@@ -615,7 +660,7 @@ void __rcu_irq_enter_check_tick(void)
if (in_nmi())
return;
- RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
+ RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(),
"Illegal rcu_irq_enter_check_tick() from extended quiescent state");
if (!tick_nohz_full_cpu(rdp->cpu) ||
@@ -697,7 +742,7 @@ notrace bool rcu_is_watching(void)
bool ret;
preempt_disable_notrace();
- ret = !rcu_dynticks_curr_cpu_in_eqs();
+ ret = rcu_is_watching_curr_cpu();
preempt_enable_notrace();
return ret;
}
@@ -721,6 +766,25 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
+static unsigned long seq_gpwrap_lag = ULONG_MAX / 4;
+
+/**
+ * rcu_set_gpwrap_lag - Set RCU GP sequence overflow lag value.
+ * @lag_gps: Set overflow lag to this many grace period worth of counters
+ * which is used by rcutorture to quickly force a gpwrap situation.
+ * @lag_gps = 0 means we reset it back to the boot-time value.
+ */
+void rcu_set_gpwrap_lag(unsigned long lag_gps)
+{
+ unsigned long lag_seq_count;
+
+ lag_seq_count = (lag_gps == 0)
+ ? ULONG_MAX / 4
+ : lag_gps << RCU_SEQ_CTR_SHIFT;
+ WRITE_ONCE(seq_gpwrap_lag, lag_seq_count);
+}
+EXPORT_SYMBOL_GPL(rcu_set_gpwrap_lag);
+
/*
* When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
@@ -731,22 +795,35 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
{
raw_lockdep_assert_held_rcu_node(rnp);
- if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
- rnp->gp_seq))
+ if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + seq_gpwrap_lag,
+ rnp->gp_seq)) {
WRITE_ONCE(rdp->gpwrap, true);
+ WRITE_ONCE(rdp->gpwrap_count, READ_ONCE(rdp->gpwrap_count) + 1);
+ }
if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
}
/*
- * Snapshot the specified CPU's dynticks counter so that we can later
+ * Snapshot the specified CPU's RCU_WATCHING counter so that we can later
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
*/
-static int dyntick_save_progress_counter(struct rcu_data *rdp)
+static int rcu_watching_snap_save(struct rcu_data *rdp)
{
- rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu);
- if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
+ /*
+ * Full ordering between remote CPU's post idle accesses and updater's
+ * accesses prior to current GP (and also the started GP sequence number)
+ * is enforced by rcu_seq_start() implicit barrier and even further by
+ * smp_mb__after_unlock_lock() barriers chained all the way throughout the
+ * rnp locking tree since rcu_gp_init() and up to the current leaf rnp
+ * locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and post grace period
+ * updater's accesses is enforced by the below acquire semantic.
+ */
+ rdp->watching_snap = ct_rcu_watching_cpu_acquire(rdp->cpu);
+ if (rcu_watching_snap_in_eqs(rdp->watching_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rdp->mynode, rdp);
return 1;
@@ -754,17 +831,21 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
return 0;
}
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+
/*
* Returns positive if the specified CPU has passed through a quiescent state
* by virtue of being in or having passed through an dynticks idle state since
- * the last call to dyntick_save_progress_counter() for this same CPU, or by
+ * the last call to rcu_watching_snap_save() for this same CPU, or by
* virtue of having been offline.
*
* Returns negative if the specified CPU needs a force resched.
*
* Returns zero otherwise.
*/
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+static int rcu_watching_snap_recheck(struct rcu_data *rdp)
{
unsigned long jtsq;
int ret = 0;
@@ -778,7 +859,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) {
+ if (rcu_watching_snap_stopped_since(rdp, rdp->watching_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rnp, rdp);
return 1;
@@ -813,8 +894,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
__func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
- (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
- (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
+ (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+ (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
return 1; /* Break things loose after complaining. */
}
@@ -889,9 +970,9 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
- rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
- rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
- rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(cpu);
rsrp->jiffies = jiffies;
rsrp->gp_seq = rdp->gp_seq;
}
@@ -1013,38 +1094,6 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
return needmore;
}
-static void swake_up_one_online_ipi(void *arg)
-{
- struct swait_queue_head *wqh = arg;
-
- swake_up_one(wqh);
-}
-
-static void swake_up_one_online(struct swait_queue_head *wqh)
-{
- int cpu = get_cpu();
-
- /*
- * If called from rcutree_report_cpu_starting(), wake up
- * is dangerous that late in the CPU-down hotplug process. The
- * scheduler might queue an ignored hrtimer. Defer the wake up
- * to an online CPU instead.
- */
- if (unlikely(cpu_is_offline(cpu))) {
- int target;
-
- target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
- cpu_online_mask);
-
- smp_call_function_single(target, swake_up_one_online_ipi,
- wqh, 0);
- put_cpu();
- } else {
- put_cpu();
- swake_up_one(wqh);
- }
-}
-
/*
* Awaken the grace-period kthread. Don't do a self-awaken (unless in an
* interrupt or softirq handler, in which case we just might immediately
@@ -1069,7 +1118,7 @@ static void rcu_gp_kthread_wake(void)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
- swake_up_one_online(&rcu_state.gp_wq);
+ swake_up_one(&rcu_state.gp_wq);
}
/*
@@ -1227,7 +1276,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Handle the ends of any preceding grace periods first. */
if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
- unlikely(READ_ONCE(rdp->gpwrap))) {
+ unlikely(rdp->gpwrap)) {
if (!offloaded)
ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
rdp->core_needs_qs = false;
@@ -1241,7 +1290,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
- unlikely(READ_ONCE(rdp->gpwrap))) {
+ unlikely(rdp->gpwrap)) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1256,7 +1305,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
- if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+ if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap)
WRITE_ONCE(rdp->last_sched_clock, jiffies);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
@@ -1423,6 +1472,324 @@ static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
}
/*
+ * There is a single llist, which is used for handling
+ * synchronize_rcu() users' enqueued rcu_synchronize nodes.
+ * Within this llist, there are two tail pointers:
+ *
+ * wait tail: Tracks the set of nodes, which need to
+ * wait for the current GP to complete.
+ * done tail: Tracks the set of nodes, for which grace
+ * period has elapsed. These nodes processing
+ * will be done as part of the cleanup work
+ * execution by a kworker.
+ *
+ * At every grace period init, a new wait node is added
+ * to the llist. This wait node is used as wait tail
+ * for this new grace period. Given that there are a fixed
+ * number of wait nodes, if all wait nodes are in use
+ * (which can happen when kworker callback processing
+ * is delayed) and additional grace period is requested.
+ * This means, a system is slow in processing callbacks.
+ *
+ * TODO: If a slow processing is detected, a first node
+ * in the llist should be used as a wait-tail for this
+ * grace period, therefore users which should wait due
+ * to a slow process are handled by _this_ grace period
+ * and not next.
+ *
+ * Below is an illustration of how the done and wait
+ * tail pointers move from one set of rcu_synchronize nodes
+ * to the other, as grace periods start and finish and
+ * nodes are processed by kworker.
+ *
+ *
+ * a. Initial llist callbacks list:
+ *
+ * +----------+ +--------+ +-------+
+ * | | | | | |
+ * | head |---------> | cb2 |--------->| cb1 |
+ * | | | | | |
+ * +----------+ +--------+ +-------+
+ *
+ *
+ *
+ * b. New GP1 Start:
+ *
+ * WAIT TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+ +--------+ +-------+
+ * | | | | | | | |
+ * | head ------> wait |------> cb2 |------> | cb1 |
+ * | | | head1 | | | | |
+ * +----------+ +--------+ +--------+ +-------+
+ *
+ *
+ *
+ * c. GP completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ *
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+ +--------+ +-------+
+ * | | | | | | | |
+ * | head ------> wait |------> cb2 |------> | cb1 |
+ * | | | head1 | | | | |
+ * +----------+ +--------+ +--------+ +-------+
+ *
+ *
+ *
+ * d. New callbacks and GP2 start:
+ *
+ * WAIT TAIL DONE TAIL
+ * | |
+ * | |
+ * v v
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ * | | | | | | | | | | | | | |
+ * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 |
+ * | | | head2| | | | | |head1| | | | |
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ *
+ *
+ *
+ * e. GP2 completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ * | | | | | | | | | | | | | |
+ * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 |
+ * | | | head2| | | | | |head1| | | | |
+ * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+
+ *
+ *
+ * While the llist state transitions from d to e, a kworker
+ * can start executing rcu_sr_normal_gp_cleanup_work() and
+ * can observe either the old done tail (@c) or the new
+ * done tail (@e). So, done tail updates and reads need
+ * to use the rel-acq semantics. If the concurrent kworker
+ * observes the old done tail, the newly queued work
+ * execution will process the updated done tail. If the
+ * concurrent kworker observes the new done tail, then
+ * the newly queued work will skip processing the done
+ * tail, as workqueue semantics guarantees that the new
+ * work is executed only after the previous one completes.
+ *
+ * f. kworker callbacks processing complete:
+ *
+ *
+ * DONE TAIL
+ * |
+ * |
+ * v
+ * +----------+ +--------+
+ * | | | |
+ * | head ------> wait |
+ * | | | head2 |
+ * +----------+ +--------+
+ *
+ */
+static bool rcu_sr_is_wait_head(struct llist_node *node)
+{
+ return &(rcu_state.srs_wait_nodes)[0].node <= node &&
+ node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node;
+}
+
+static struct llist_node *rcu_sr_get_wait_head(void)
+{
+ struct sr_wait_node *sr_wn;
+ int i;
+
+ for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) {
+ sr_wn = &(rcu_state.srs_wait_nodes)[i];
+
+ if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1))
+ return &sr_wn->node;
+ }
+
+ return NULL;
+}
+
+static void rcu_sr_put_wait_head(struct llist_node *node)
+{
+ struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node);
+
+ atomic_set_release(&sr_wn->inuse, 0);
+}
+
+/* Disabled by default. */
+static int rcu_normal_wake_from_gp;
+module_param(rcu_normal_wake_from_gp, int, 0644);
+static struct workqueue_struct *sync_wq;
+
+static void rcu_sr_normal_complete(struct llist_node *node)
+{
+ struct rcu_synchronize *rs = container_of(
+ (struct rcu_head *) node, struct rcu_synchronize, head);
+
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+ !poll_state_synchronize_rcu_full(&rs->oldstate),
+ "A full grace period is not passed yet!\n");
+
+ /* Finally. */
+ complete(&rs->completion);
+}
+
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
+{
+ struct llist_node *done, *rcu, *next, *head;
+
+ /*
+ * This work execution can potentially execute
+ * while a new done tail is being updated by
+ * grace period kthread in rcu_sr_normal_gp_cleanup().
+ * So, read and updates of done tail need to
+ * follow acq-rel semantics.
+ *
+ * Given that wq semantics guarantees that a single work
+ * cannot execute concurrently by multiple kworkers,
+ * the done tail list manipulations are protected here.
+ */
+ done = smp_load_acquire(&rcu_state.srs_done_tail);
+ if (WARN_ON_ONCE(!done))
+ return;
+
+ WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
+ head = done->next;
+ done->next = NULL;
+
+ /*
+ * The dummy node, which is pointed to by the
+ * done tail which is acq-read above is not removed
+ * here. This allows lockless additions of new
+ * rcu_synchronize nodes in rcu_sr_normal_add_req(),
+ * while the cleanup work executes. The dummy
+ * nodes is removed, in next round of cleanup
+ * work execution.
+ */
+ llist_for_each_safe(rcu, next, head) {
+ if (!rcu_sr_is_wait_head(rcu)) {
+ rcu_sr_normal_complete(rcu);
+ continue;
+ }
+
+ rcu_sr_put_wait_head(rcu);
+ }
+
+ /* Order list manipulations with atomic access. */
+ atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
+}
+
+/*
+ * Helper function for rcu_gp_cleanup().
+ */
+static void rcu_sr_normal_gp_cleanup(void)
+{
+ struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
+ int done = 0;
+
+ wait_tail = rcu_state.srs_wait_tail;
+ if (wait_tail == NULL)
+ return;
+
+ rcu_state.srs_wait_tail = NULL;
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+ WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail));
+
+ /*
+ * Process (a) and (d) cases. See an illustration.
+ */
+ llist_for_each_safe(rcu, next, wait_tail->next) {
+ if (rcu_sr_is_wait_head(rcu))
+ break;
+
+ rcu_sr_normal_complete(rcu);
+ // It can be last, update a next on this step.
+ wait_tail->next = next;
+
+ if (++done == SR_MAX_USERS_WAKE_FROM_GP)
+ break;
+ }
+
+ /*
+ * Fast path, no more users to process except putting the second last
+ * wait head if no inflight-workers. If there are in-flight workers,
+ * they will remove the last wait head.
+ *
+ * Note that the ACQUIRE orders atomic access with list manipulation.
+ */
+ if (wait_tail->next && wait_tail->next->next == NULL &&
+ rcu_sr_is_wait_head(wait_tail->next) &&
+ !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
+ rcu_sr_put_wait_head(wait_tail->next);
+ wait_tail->next = NULL;
+ }
+
+ /* Concurrent sr_normal_gp_cleanup work might observe this update. */
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
+ smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+
+ /*
+ * We schedule a work in order to perform a final processing
+ * of outstanding users(if still left) and releasing wait-heads
+ * added by rcu_sr_normal_gp_init() call.
+ */
+ if (wait_tail->next) {
+ atomic_inc(&rcu_state.srs_cleanups_pending);
+ if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
+ atomic_dec(&rcu_state.srs_cleanups_pending);
+ }
+}
+
+/*
+ * Helper function for rcu_gp_init().
+ */
+static bool rcu_sr_normal_gp_init(void)
+{
+ struct llist_node *first;
+ struct llist_node *wait_head;
+ bool start_new_poll = false;
+
+ first = READ_ONCE(rcu_state.srs_next.first);
+ if (!first || rcu_sr_is_wait_head(first))
+ return start_new_poll;
+
+ wait_head = rcu_sr_get_wait_head();
+ if (!wait_head) {
+ // Kick another GP to retry.
+ start_new_poll = true;
+ return start_new_poll;
+ }
+
+ /* Inject a wait-dummy-node. */
+ llist_add(wait_head, &rcu_state.srs_next);
+
+ /*
+ * A waiting list of rcu_synchronize nodes should be empty on
+ * this step, since a GP-kthread, rcu_gp_init() -> gp_cleanup(),
+ * rolls it over. If not, it is a BUG, warn a user.
+ */
+ WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL);
+ rcu_state.srs_wait_tail = wait_head;
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+
+ return start_new_poll;
+}
+
+static void rcu_sr_normal_add_req(struct rcu_synchronize *rs)
+{
+ llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next);
+}
+
+/*
* Initialize a new grace period. Return false if no grace period required.
*/
static noinline_for_stack bool rcu_gp_init(void)
@@ -1432,10 +1799,12 @@ static noinline_for_stack bool rcu_gp_init(void)
unsigned long mask;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root();
+ bool start_new_poll;
+ unsigned long old_gp_seq;
WRITE_ONCE(rcu_state.gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp);
- if (!READ_ONCE(rcu_state.gp_flags)) {
+ if (!rcu_state.gp_flags) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq_rcu_node(rnp);
return false;
@@ -1453,14 +1822,37 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Advance to a new grace period and initialize state. */
record_gp_stall_check_time();
+ /*
+ * A new wait segment must be started before gp_seq advanced, so
+ * that previous gp waiters won't observe the new gp_seq.
+ */
+ start_new_poll = rcu_sr_normal_gp_init();
/* Record GP times before starting GP, hence rcu_seq_start(). */
+ old_gp_seq = rcu_state.gp_seq;
rcu_seq_start(&rcu_state.gp_seq);
+ /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+ rcu_seq_done_exact(&old_gp_seq, rcu_seq_snap(&rcu_state.gp_seq)));
+
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
+ * The "start_new_poll" is set to true, only when this GP is not able
+ * to handle anything and there are outstanding users. It happens when
+ * the rcu_sr_normal_gp_init() function was not able to insert a dummy
+ * separator to the llist, because there were no left any dummy-nodes.
+ *
+ * Number of dummy-nodes is fixed, it could be that we are run out of
+ * them, if so we start a new pool request to repeat a try. It is rare
+ * and it means that a system is doing a slow processing of callbacks.
+ */
+ if (start_new_poll)
+ (void) start_poll_synchronize_rcu();
+
+ /*
* Apply per-leaf buffered online and offline operations to
* the rcu_node tree. Note that this new grace period need not
* wait for subsequent online CPUs, and that RCU hooks in the CPU
@@ -1472,7 +1864,7 @@ static noinline_for_stack bool rcu_gp_init(void)
WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
/* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
- local_irq_save(flags);
+ local_irq_disable();
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1480,7 +1872,7 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Nothing to do on this leaf rcu_node structure. */
raw_spin_unlock_rcu_node(rnp);
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ local_irq_enable();
continue;
}
@@ -1517,7 +1909,7 @@ static noinline_for_stack bool rcu_gp_init(void)
raw_spin_unlock_rcu_node(rnp);
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ local_irq_enable();
}
rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -1612,16 +2004,15 @@ static void rcu_gp_fqs(bool first_time)
if (first_time) {
/* Collect dyntick-idle snapshots. */
- force_qs_rnp(dyntick_save_progress_counter);
+ force_qs_rnp(rcu_watching_snap_save);
} else {
/* Handle dyntick-idle and offline CPUs. */
- force_qs_rnp(rcu_implicit_dynticks_qs);
+ force_qs_rnp(rcu_watching_snap_recheck);
}
/* Clear flag to prevent immediate re-entry. */
if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
raw_spin_lock_irq_rcu_node(rnp);
- WRITE_ONCE(rcu_state.gp_flags,
- READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq_rcu_node(rnp);
}
}
@@ -1825,6 +2216,9 @@ static noinline void rcu_gp_cleanup(void)
}
raw_spin_unlock_irq_rcu_node(rnp);
+ // Make synchronize_rcu() users aware of the end of old grace period.
+ rcu_sr_normal_gp_cleanup();
+
// If strict, make all CPUs aware of the end of the old grace period.
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
@@ -1882,8 +2276,7 @@ static void rcu_report_qs_rsp(unsigned long flags)
{
raw_lockdep_assert_held_rcu_node(rcu_get_root());
WARN_ON_ONCE(!rcu_gp_in_progress());
- WRITE_ONCE(rcu_state.gp_flags,
- READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
rcu_gp_kthread_wake();
}
@@ -2010,7 +2403,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needacc = false;
struct rcu_node *rnp;
WARN_ON_ONCE(rdp->cpu != smp_processor_id());
@@ -2047,23 +2439,11 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
* to return true. So complain, but don't awaken.
*/
WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
- } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
- /*
- * ...but NOCB kthreads may miss or delay callbacks acceleration
- * if in the middle of a (de-)offloading process.
- */
- needacc = true;
}
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
-
- if (needacc) {
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_accelerate_cbs_unlocked(rnp, rdp);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
}
}
@@ -2398,8 +2778,7 @@ void rcu_force_quiescent_state(void)
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
return; /* Someone beat us to it. */
}
- WRITE_ONCE(rcu_state.gp_flags,
- READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
rcu_gp_kthread_wake();
}
@@ -2419,24 +2798,6 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- /*
- * On RT rcu_core() can be preempted when IRQs aren't disabled.
- * Therefore this function can race with concurrent NOCB (de-)offloading
- * on this CPU and the below condition must be considered volatile.
- * However if we race with:
- *
- * _ Offloading: In the worst case we accelerate or process callbacks
- * concurrently with NOCB kthreads. We are guaranteed to
- * call rcu_nocb_lock() if that happens.
- *
- * _ Deoffloading: In the worst case we miss callbacks acceleration or
- * processing. This is fine because the early stage
- * of deoffloading invokes rcu_core() after setting
- * SEGCBLIST_RCU_CORE. So we guarantee that we'll process
- * what could have been dismissed without the need to wait
- * for the next rcu_pending() check in the next jiffy.
- */
- const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2456,17 +2817,17 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
- rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
+ local_irq_save(flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ local_irq_restore(flags);
}
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) &&
likely(READ_ONCE(rcu_scheduler_fully_active))) {
rcu_do_batch(rdp);
/* Re-invoke RCU core processing if there are callbacks remaining. */
@@ -2483,7 +2844,7 @@ static __latent_entropy void rcu_core(void)
queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work);
}
-static void rcu_core_si(struct softirq_action *h)
+static void rcu_core_si(void)
{
rcu_core();
}
@@ -2600,13 +2961,8 @@ static int __init rcu_spawn_core_kthreads(void)
static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func)
{
rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kvfree_rcu_offset((unsigned long)func))
- trace_rcu_kvfree_callback(rcu_state.name, head,
- (unsigned long)func,
- rcu_segcblist_n_cbs(&rdp->cblist));
- else
- trace_rcu_callback(rcu_state.name, head,
- rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_callback(rcu_state.name, head,
+ rcu_segcblist_n_cbs(&rdp->cblist));
trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
}
@@ -2731,9 +3087,12 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
}
head->func = func;
head->next = NULL;
- kasan_record_aux_stack_noalloc(head);
+ kasan_record_aux_stack(head);
+
local_irq_save(flags);
rdp = this_cpu_ptr(&rcu_data);
+ RCU_LOCKDEP_WARN(!rcu_rdp_cpu_online(rdp), "Callback enqueued on offline CPU!");
+
lazy = lazy_in && !rcu_async_should_hurry();
/* Add the callback to our list. */
@@ -2773,7 +3132,7 @@ module_param(enable_rcu_lazy, bool, 0444);
* critical sections have completed.
*
* Use this API instead of call_rcu() if you don't want the callback to be
- * invoked after very long periods of time, which can happen on systems without
+ * delayed for very long periods of time, which can happen on systems without
* memory pressure and on systems which are lightly loaded or mostly idle.
* This function will cause callbacks to be invoked sooner than later at the
* expense of extra power. Other than that, this function is identical to, and
@@ -2804,6 +3163,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
* might well execute concurrently with RCU read-side critical sections
* that started after call_rcu() was invoked.
*
+ * It is perfectly legal to repost an RCU callback, potentially with
+ * a different callback function, from within its callback function.
+ * The specified function will be invoked after another full grace period
+ * has elapsed. This use case is similar in form to the common practice
+ * of reposting a timer from within its own handler.
+ *
* RCU read-side critical sections are delimited by rcu_read_lock()
* and rcu_read_unlock(), and may be nested. In addition, but only in
* v5.0 and later, regions of code across which interrupts, preemption,
@@ -2832,6 +3197,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
*
* Implementation of these memory-ordering guarantees is described here:
* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ *
+ * Specific to call_rcu() (as opposed to the other call_rcu*() functions),
+ * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many
+ * seconds before starting the grace period needed by the corresponding
+ * callback. This delay can significantly improve energy-efficiency
+ * on low-utilization battery-powered devices. To avoid this delay,
+ * in latency-sensitive kernel code, use call_rcu_hurry().
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
@@ -2839,724 +3211,60 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(call_rcu);
-/* Maximum number of jiffies to wait before draining a batch. */
-#define KFREE_DRAIN_JIFFIES (5 * HZ)
-#define KFREE_N_BATCHES 2
-#define FREE_N_CHANNELS 2
-
-/**
- * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
- * @list: List node. All blocks are linked between each other
- * @gp_snap: Snapshot of RCU state for objects placed to this bulk
- * @nr_records: Number of active pointers in the array
- * @records: Array of the kvfree_rcu() pointers
- */
-struct kvfree_rcu_bulk_data {
- struct list_head list;
- struct rcu_gp_oldstate gp_snap;
- unsigned long nr_records;
- void *records[];
-};
-
/*
- * This macro defines how many entries the "records" array
- * will contain. It is based on the fact that the size of
- * kvfree_rcu_bulk_data structure becomes exactly one page.
- */
-#define KVFREE_BULK_MAX_ENTR \
- ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
-
-/**
- * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
- * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
- * @head_free: List of kfree_rcu() objects waiting for a grace period
- * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
- * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
- * @krcp: Pointer to @kfree_rcu_cpu structure
- */
-
-struct kfree_rcu_cpu_work {
- struct rcu_work rcu_work;
- struct rcu_head *head_free;
- struct rcu_gp_oldstate head_free_gp_snap;
- struct list_head bulk_head_free[FREE_N_CHANNELS];
- struct kfree_rcu_cpu *krcp;
-};
-
-/**
- * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
- * @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
- * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
- * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
- * @lock: Synchronize access to this structure
- * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @initialized: The @rcu_work fields have been initialized
- * @head_count: Number of objects in rcu_head singular list
- * @bulk_count: Number of objects in bulk-list
- * @bkvcache:
- * A simple cache list that contains objects for reuse purpose.
- * In order to save some per-cpu space the list is singular.
- * Even though it is lockless an access has to be protected by the
- * per-cpu lock.
- * @page_cache_work: A work to refill the cache when it is empty
- * @backoff_page_cache_fill: Delay cache refills
- * @work_in_progress: Indicates that page_cache_work is running
- * @hrtimer: A hrtimer for scheduling a page_cache_work
- * @nr_bkv_objs: number of allocated objects at @bkvcache.
+ * During early boot, any blocking grace-period wait automatically
+ * implies a grace period.
*
- * This is a per-CPU structure. The reason that it is not included in
- * the rcu_data structure is to permit this code to be extracted from
- * the RCU files. Such extraction could allow further optimization of
- * the interactions with the slab allocators.
- */
-struct kfree_rcu_cpu {
- // Objects queued on a linked list
- // through their rcu_head structures.
- struct rcu_head *head;
- unsigned long head_gp_snap;
- atomic_t head_count;
-
- // Objects queued on a bulk-list.
- struct list_head bulk_head[FREE_N_CHANNELS];
- atomic_t bulk_count[FREE_N_CHANNELS];
-
- struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
- raw_spinlock_t lock;
- struct delayed_work monitor_work;
- bool initialized;
-
- struct delayed_work page_cache_work;
- atomic_t backoff_page_cache_fill;
- atomic_t work_in_progress;
- struct hrtimer hrtimer;
-
- struct llist_head bkvcache;
- int nr_bkv_objs;
-};
-
-static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
- .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
-};
-
-static __always_inline void
-debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
-{
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- int i;
-
- for (i = 0; i < bhead->nr_records; i++)
- debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
-#endif
-}
-
-static inline struct kfree_rcu_cpu *
-krc_this_cpu_lock(unsigned long *flags)
-{
- struct kfree_rcu_cpu *krcp;
-
- local_irq_save(*flags); // For safely calling this_cpu_ptr().
- krcp = this_cpu_ptr(&krc);
- raw_spin_lock(&krcp->lock);
-
- return krcp;
-}
-
-static inline void
-krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
-{
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-}
-
-static inline struct kvfree_rcu_bulk_data *
-get_cached_bnode(struct kfree_rcu_cpu *krcp)
-{
- if (!krcp->nr_bkv_objs)
- return NULL;
-
- WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
- return (struct kvfree_rcu_bulk_data *)
- llist_del_first(&krcp->bkvcache);
-}
-
-static inline bool
-put_cached_bnode(struct kfree_rcu_cpu *krcp,
- struct kvfree_rcu_bulk_data *bnode)
-{
- // Check the limit.
- if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
- return false;
-
- llist_add((struct llist_node *) bnode, &krcp->bkvcache);
- WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
- return true;
-}
-
-static int
-drain_page_cache(struct kfree_rcu_cpu *krcp)
-{
- unsigned long flags;
- struct llist_node *page_list, *pos, *n;
- int freed = 0;
-
- if (!rcu_min_cached_objs)
- return 0;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- page_list = llist_del_all(&krcp->bkvcache);
- WRITE_ONCE(krcp->nr_bkv_objs, 0);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- llist_for_each_safe(pos, n, page_list) {
- free_page((unsigned long)pos);
- freed++;
- }
-
- return freed;
-}
-
-static void
-kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
- struct kvfree_rcu_bulk_data *bnode, int idx)
-{
- unsigned long flags;
- int i;
-
- if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
- debug_rcu_bhead_unqueue(bnode);
- rcu_lock_acquire(&rcu_callback_map);
- if (idx == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bnode->nr_records,
- bnode->records);
-
- kfree_bulk(bnode->nr_records, bnode->records);
- } else { // vmalloc() / vfree().
- for (i = 0; i < bnode->nr_records; i++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name, bnode->records[i], 0);
-
- vfree(bnode->records[i]);
- }
- }
- rcu_lock_release(&rcu_callback_map);
- }
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (put_cached_bnode(krcp, bnode))
- bnode = NULL;
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- if (bnode)
- free_page((unsigned long) bnode);
-
- cond_resched_tasks_rcu_qs();
-}
-
-static void
-kvfree_rcu_list(struct rcu_head *head)
-{
- struct rcu_head *next;
-
- for (; head; head = next) {
- void *ptr = (void *) head->func;
- unsigned long offset = (void *) head - ptr;
-
- next = head->next;
- debug_rcu_head_unqueue((struct rcu_head *)ptr);
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
-
- if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
- kvfree(ptr);
-
- rcu_lock_release(&rcu_callback_map);
- cond_resched_tasks_rcu_qs();
- }
-}
-
-/*
- * This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bulk_head_free or ->head_free.
- */
-static void kfree_rcu_work(struct work_struct *work)
-{
- unsigned long flags;
- struct kvfree_rcu_bulk_data *bnode, *n;
- struct list_head bulk_head[FREE_N_CHANNELS];
- struct rcu_head *head;
- struct kfree_rcu_cpu *krcp;
- struct kfree_rcu_cpu_work *krwp;
- struct rcu_gp_oldstate head_gp_snap;
- int i;
-
- krwp = container_of(to_rcu_work(work),
- struct kfree_rcu_cpu_work, rcu_work);
- krcp = krwp->krcp;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- // Channels 1 and 2.
- for (i = 0; i < FREE_N_CHANNELS; i++)
- list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
-
- // Channel 3.
- head = krwp->head_free;
- krwp->head_free = NULL;
- head_gp_snap = krwp->head_free_gp_snap;
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- // Handle the first two channels.
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- // Start from the tail page, so a GP is likely passed for it.
- list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
- kvfree_rcu_bulk(krcp, bnode, i);
- }
-
- /*
- * This is used when the "bulk" path can not be used for the
- * double-argument of kvfree_rcu(). This happens when the
- * page-cache is empty, which means that objects are instead
- * queued on a linked list through their rcu_head structures.
- * This list is named "Channel 3".
- */
- if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
- kvfree_rcu_list(head);
-}
-
-static bool
-need_offload_krc(struct kfree_rcu_cpu *krcp)
-{
- int i;
-
- for (i = 0; i < FREE_N_CHANNELS; i++)
- if (!list_empty(&krcp->bulk_head[i]))
- return true;
-
- return !!READ_ONCE(krcp->head);
-}
-
-static bool
-need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
-{
- int i;
-
- for (i = 0; i < FREE_N_CHANNELS; i++)
- if (!list_empty(&krwp->bulk_head_free[i]))
- return true;
-
- return !!krwp->head_free;
-}
-
-static int krc_count(struct kfree_rcu_cpu *krcp)
-{
- int sum = atomic_read(&krcp->head_count);
- int i;
-
- for (i = 0; i < FREE_N_CHANNELS; i++)
- sum += atomic_read(&krcp->bulk_count[i]);
-
- return sum;
-}
-
-static void
-schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
-{
- long delay, delay_left;
-
- delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
- if (delayed_work_pending(&krcp->monitor_work)) {
- delay_left = krcp->monitor_work.timer.expires - jiffies;
- if (delay < delay_left)
- mod_delayed_work(system_wq, &krcp->monitor_work, delay);
- return;
- }
- queue_delayed_work(system_wq, &krcp->monitor_work, delay);
-}
-
-static void
-kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
-{
- struct list_head bulk_ready[FREE_N_CHANNELS];
- struct kvfree_rcu_bulk_data *bnode, *n;
- struct rcu_head *head_ready = NULL;
- unsigned long flags;
- int i;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- INIT_LIST_HEAD(&bulk_ready[i]);
-
- list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
- if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
- break;
-
- atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
- list_move(&bnode->list, &bulk_ready[i]);
- }
- }
-
- if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
- head_ready = krcp->head;
- atomic_set(&krcp->head_count, 0);
- WRITE_ONCE(krcp->head, NULL);
- }
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
- kvfree_rcu_bulk(krcp, bnode, i);
- }
-
- if (head_ready)
- kvfree_rcu_list(head_ready);
-}
-
-/*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ * Later on, this could in theory be the case for kernels built with
+ * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
+ * is not a common case. Furthermore, this optimization would cause
+ * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * grace-period optimization is ignored once the scheduler is running.
*/
-static void kfree_rcu_monitor(struct work_struct *work)
-{
- struct kfree_rcu_cpu *krcp = container_of(work,
- struct kfree_rcu_cpu, monitor_work.work);
- unsigned long flags;
- int i, j;
-
- // Drain ready for reclaim.
- kvfree_rcu_drain_ready(krcp);
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
-
- // Attempt to start a new batch.
- for (i = 0; i < KFREE_N_BATCHES; i++) {
- struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
-
- // Try to detach bulk_head or head and attach it, only when
- // all channels are free. Any channel is not free means at krwp
- // there is on-going rcu work to handle krwp's free business.
- if (need_wait_for_krwp_work(krwp))
- continue;
-
- // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
- if (need_offload_krc(krcp)) {
- // Channel 1 corresponds to the SLAB-pointer bulk path.
- // Channel 2 corresponds to vmalloc-pointer bulk path.
- for (j = 0; j < FREE_N_CHANNELS; j++) {
- if (list_empty(&krwp->bulk_head_free[j])) {
- atomic_set(&krcp->bulk_count[j], 0);
- list_replace_init(&krcp->bulk_head[j],
- &krwp->bulk_head_free[j]);
- }
- }
-
- // Channel 3 corresponds to both SLAB and vmalloc
- // objects queued on the linked list.
- if (!krwp->head_free) {
- krwp->head_free = krcp->head;
- get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
- atomic_set(&krcp->head_count, 0);
- WRITE_ONCE(krcp->head, NULL);
- }
-
- // One work is per one batch, so there are three
- // "free channels", the batch can handle. It can
- // be that the work is in the pending state when
- // channels have been detached following by each
- // other.
- queue_rcu_work(system_wq, &krwp->rcu_work);
- }
- }
-
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- // If there is nothing to detach, it means that our job is
- // successfully done here. In case of having at least one
- // of the channels that is still busy we should rearm the
- // work to repeat an attempt. Because previous batches are
- // still in progress.
- if (need_offload_krc(krcp))
- schedule_delayed_monitor_work(krcp);
-}
-
-static enum hrtimer_restart
-schedule_page_work_fn(struct hrtimer *t)
-{
- struct kfree_rcu_cpu *krcp =
- container_of(t, struct kfree_rcu_cpu, hrtimer);
-
- queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
- return HRTIMER_NORESTART;
-}
-
-static void fill_page_cache_func(struct work_struct *work)
-{
- struct kvfree_rcu_bulk_data *bnode;
- struct kfree_rcu_cpu *krcp =
- container_of(work, struct kfree_rcu_cpu,
- page_cache_work.work);
- unsigned long flags;
- int nr_pages;
- bool pushed;
- int i;
-
- nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
- 1 : rcu_min_cached_objs;
-
- for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
-
- if (!bnode)
- break;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- pushed = put_cached_bnode(krcp, bnode);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- if (!pushed) {
- free_page((unsigned long) bnode);
- break;
- }
- }
-
- atomic_set(&krcp->work_in_progress, 0);
- atomic_set(&krcp->backoff_page_cache_fill, 0);
-}
-
-static void
-run_page_cache_worker(struct kfree_rcu_cpu *krcp)
-{
- // If cache disabled, bail out.
- if (!rcu_min_cached_objs)
- return;
-
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
- !atomic_xchg(&krcp->work_in_progress, 1)) {
- if (atomic_read(&krcp->backoff_page_cache_fill)) {
- queue_delayed_work(system_wq,
- &krcp->page_cache_work,
- msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
- } else {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
- hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
- }
- }
-}
-
-// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
-// state specified by flags. If can_alloc is true, the caller must
-// be schedulable and not be holding any locks or mutexes that might be
-// acquired by the memory allocator or anything that it might invoke.
-// Returns true if ptr was successfully recorded, else the caller must
-// use a fallback.
-static inline bool
-add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
- unsigned long *flags, void *ptr, bool can_alloc)
+static int rcu_blocking_is_gp(void)
{
- struct kvfree_rcu_bulk_data *bnode;
- int idx;
-
- *krcp = krc_this_cpu_lock(flags);
- if (unlikely(!(*krcp)->initialized))
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
+ might_sleep();
return false;
-
- idx = !!is_vmalloc_addr(ptr);
- bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
- struct kvfree_rcu_bulk_data, list);
-
- /* Check if a new block is required. */
- if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
- bnode = get_cached_bnode(*krcp);
- if (!bnode && can_alloc) {
- krc_this_cpu_unlock(*krcp, *flags);
-
- // __GFP_NORETRY - allows a light-weight direct reclaim
- // what is OK from minimizing of fallback hitting point of
- // view. Apart of that it forbids any OOM invoking what is
- // also beneficial since we are about to release memory soon.
- //
- // __GFP_NOMEMALLOC - prevents from consuming of all the
- // memory reserves. Please note we have a fallback path.
- //
- // __GFP_NOWARN - it is supposed that an allocation can
- // be failed under low memory or high memory pressure
- // scenarios.
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
- }
-
- if (!bnode)
- return false;
-
- // Initialize the new block and attach it.
- bnode->nr_records = 0;
- list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
}
-
- // Finally insert and update the GP for this page.
- bnode->records[bnode->nr_records++] = ptr;
- get_state_synchronize_rcu_full(&bnode->gp_snap);
- atomic_inc(&(*krcp)->bulk_count[idx]);
-
return true;
}
/*
- * Queue a request for lazy invocation of the appropriate free routine
- * after a grace period. Please note that three paths are maintained,
- * two for the common case using arrays of pointers and a third one that
- * is used only when the main paths cannot be used, for example, due to
- * memory pressure.
- *
- * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
- * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
- * be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
+ * Helper function for the synchronize_rcu() API.
*/
-void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+static void synchronize_rcu_normal(void)
{
- unsigned long flags;
- struct kfree_rcu_cpu *krcp;
- bool success;
-
- /*
- * Please note there is a limitation for the head-less
- * variant, that is why there is a clear rule for such
- * objects: it can be used from might_sleep() context
- * only. For other places please embed an rcu_head to
- * your data.
- */
- if (!head)
- might_sleep();
+ struct rcu_synchronize rs;
- // Queue the object but don't yet schedule the batch.
- if (debug_rcu_head_queue(ptr)) {
- // Probable double kfree_rcu(), just leak.
- WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
- __func__, head);
+ trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
- // Mark as success and leave.
- return;
+ if (!READ_ONCE(rcu_normal_wake_from_gp)) {
+ wait_rcu_gp(call_rcu_hurry);
+ goto trace_complete_out;
}
- kasan_record_aux_stack_noalloc(ptr);
- success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
- if (!success) {
- run_page_cache_worker(krcp);
-
- if (head == NULL)
- // Inline if kvfree_rcu(one_arg) call.
- goto unlock_return;
-
- head->func = ptr;
- head->next = krcp->head;
- WRITE_ONCE(krcp->head, head);
- atomic_inc(&krcp->head_count);
-
- // Take a snapshot for this krcp.
- krcp->head_gp_snap = get_state_synchronize_rcu();
- success = true;
- }
+ init_rcu_head_on_stack(&rs.head);
+ init_completion(&rs.completion);
/*
- * The kvfree_rcu() caller considers the pointer freed at this point
- * and likely removes any references to it. Since the actual slab
- * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
- * this object (no scanning or false positives reporting).
+ * This code might be preempted, therefore take a GP
+ * snapshot before adding a request.
*/
- kmemleak_ignore(ptr);
-
- // Set timer to drain after KFREE_DRAIN_JIFFIES.
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
- schedule_delayed_monitor_work(krcp);
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ get_state_synchronize_rcu_full(&rs.oldstate);
-unlock_return:
- krc_this_cpu_unlock(krcp, flags);
+ rcu_sr_normal_add_req(&rs);
- /*
- * Inline kvfree() after synchronize_rcu(). We can do
- * it from might_sleep() context only, so the current
- * CPU can pass the QS state.
- */
- if (!success) {
- debug_rcu_head_unqueue((struct rcu_head *) ptr);
- synchronize_rcu();
- kvfree(ptr);
- }
-}
-EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+ /* Kick a GP and start waiting. */
+ (void) start_poll_synchronize_rcu();
-static unsigned long
-kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
-{
- int cpu;
- unsigned long count = 0;
-
- /* Snapshot count of all CPUs */
- for_each_possible_cpu(cpu) {
- struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+ /* Now we can wait. */
+ wait_for_completion(&rs.completion);
+ destroy_rcu_head_on_stack(&rs.head);
- count += krc_count(krcp);
- count += READ_ONCE(krcp->nr_bkv_objs);
- atomic_set(&krcp->backoff_page_cache_fill, 1);
- }
-
- return count == 0 ? SHRINK_EMPTY : count;
-}
-
-static unsigned long
-kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
-{
- int cpu, freed = 0;
-
- for_each_possible_cpu(cpu) {
- int count;
- struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
- count = krc_count(krcp);
- count += drain_page_cache(krcp);
- kfree_rcu_monitor(&krcp->monitor_work.work);
-
- sc->nr_to_scan -= count;
- freed += count;
-
- if (sc->nr_to_scan <= 0)
- break;
- }
-
- return freed == 0 ? SHRINK_STOP : freed;
-}
-
-void __init kfree_rcu_scheduler_running(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
- if (need_offload_krc(krcp))
- schedule_delayed_monitor_work(krcp);
- }
-}
-
-/*
- * During early boot, any blocking grace-period wait automatically
- * implies a grace period.
- *
- * Later on, this could in theory be the case for kernels built with
- * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
- * is not a common case. Furthermore, this optimization would cause
- * the rcu_gp_oldstate structure to expand by 50%, so this potential
- * grace-period optimization is ignored once the scheduler is running.
- */
-static int rcu_blocking_is_gp(void)
-{
- if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
- might_sleep();
- return false;
- }
- return true;
+trace_complete_out:
+ trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete"));
}
/**
@@ -3610,7 +3318,7 @@ void synchronize_rcu(void)
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
- wait_rcu_gp(call_rcu_hurry);
+ synchronize_rcu_normal();
return;
}
@@ -3687,14 +3395,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
*/
void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
- struct rcu_node *rnp = rcu_get_root();
-
/*
* Any prior manipulation of RCU-protected data must happen
* before the loads from ->gp_seq and ->expedited_sequence.
*/
smp_mb(); /* ^^^ */
- rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+
+ // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use
+ // in poll_state_synchronize_rcu_full() notwithstanding. Use of
+ // the latter here would result in too-short grace periods due to
+ // interactions with newly onlined CPUs.
+ rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq);
rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
@@ -3710,7 +3421,6 @@ static void start_poll_synchronize_rcu_common(void)
struct rcu_data *rdp;
struct rcu_node *rnp;
- lockdep_assert_irqs_enabled();
local_irq_save(flags);
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
@@ -3735,9 +3445,6 @@ static void start_poll_synchronize_rcu_common(void)
* grace period has elapsed in the meantime. If the needed grace period
* is not already slated to start, notifies RCU core of the need for that
* grace period.
- *
- * Interrupts must be enabled for the case where it is necessary to awaken
- * the grace-period kthread.
*/
unsigned long start_poll_synchronize_rcu(void)
{
@@ -3758,9 +3465,6 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
* grace period (whether normal or expedited) has elapsed in the meantime.
* If the needed grace period is not already slated to start, notifies
* RCU core of the need for that grace period.
- *
- * Interrupts must be enabled for the case where it is necessary to awaken
- * the grace-period kthread.
*/
void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
@@ -3938,11 +3642,15 @@ static int rcu_pending(int user)
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
- if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
+ gp_in_progress = rcu_gp_in_progress();
+ if ((user || rcu_is_cpu_rrupt_from_idle() ||
+ (gp_in_progress &&
+ time_before(jiffies, READ_ONCE(rcu_state.gp_start) +
+ nohz_full_patience_delay_jiffies))) &&
+ rcu_nohz_full_cpu())
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
- gp_in_progress = rcu_gp_in_progress();
if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
return 1;
@@ -3990,6 +3698,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
{
unsigned long __maybe_unused s = rcu_state.barrier_sequence;
+ rhp->next = rhp; // Mark the callback as having been invoked.
if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) {
rcu_barrier_trace(TPS("LastCB"), -1, s);
complete(&rcu_state.barrier_completion);
@@ -4303,7 +4012,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
// whether spinlocks may be acquired safely.
static bool rcu_init_invoked(void)
{
- return !!rcu_state.n_online_cpus;
+ return !!READ_ONCE(rcu_state.n_online_cpus);
}
/*
@@ -4391,18 +4100,34 @@ rcu_boot_init_percpu_data(int cpu)
/* Set up local state, ensuring consistent view of global state. */
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
INIT_WORK(&rdp->strict_work, strict_work_handler);
- WARN_ON_ONCE(ct->dynticks_nesting != 1);
- WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
+ WARN_ON_ONCE(ct->nesting != 1);
+ WARN_ON_ONCE(rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu)));
rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
- rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
+ rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
- rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+ rdp->rcu_onl_gp_state = RCU_GP_CLEANED;
rdp->last_sched_clock = jiffies;
rdp->cpu = cpu;
rcu_boot_init_nocb_percpu_data(rdp);
}
+static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp)
+{
+ cpumask_var_t affinity;
+ int cpu;
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return;
+
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ cpumask_set_cpu(cpu, affinity);
+
+ kthread_affine_preferred(t, affinity);
+
+ free_cpumask_var(affinity);
+}
+
struct kthread_worker *rcu_exp_gp_kworker;
static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
@@ -4425,16 +4150,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
-}
-
-static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
-{
- struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker);
-
- if (!kworker)
- return NULL;
- return kworker->task;
+ rcu_thread_affine_rnp(kworker->task, rnp);
+ wake_up_process(kworker->task);
}
static void __init rcu_start_exp_gp_kworker(void)
@@ -4442,7 +4160,7 @@ static void __init rcu_start_exp_gp_kworker(void)
const char *name = "rcu_exp_gp_kthread_worker";
struct sched_param param = { .sched_priority = kthread_prio };
- rcu_exp_gp_kworker = kthread_create_worker(0, name);
+ rcu_exp_gp_kworker = kthread_run_worker(0, name);
if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
pr_err("Failed to create %s!\n", name);
rcu_exp_gp_kworker = NULL;
@@ -4485,7 +4203,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
- ct->dynticks_nesting = 1; /* CPU not up, no tearing. */
+ ct->nesting = 1; /* CPU not up, no tearing. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
@@ -4513,73 +4231,13 @@ int rcutree_prepare_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
return 0;
}
/*
- * Update kthreads affinity during CPU-hotplug changes.
- *
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question. The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU. If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- *
- * Any future concurrent calls are serialized via ->kthread_mutex.
- */
-static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
-{
- cpumask_var_t cm;
- unsigned long mask;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- struct task_struct *task_boost, *task_exp;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
- rnp = rdp->mynode;
-
- task_boost = rcu_boost_task(rnp);
- task_exp = rcu_exp_par_gp_task(rnp);
-
- /*
- * If CPU is the boot one, those tasks are created later from early
- * initcall since kthreadd must be created first.
- */
- if (!task_boost && !task_exp)
- return;
-
- if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
- return;
-
- mutex_lock(&rnp->kthread_mutex);
- mask = rcu_rnp_online_cpus(rnp);
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
- cpu != outgoingcpu)
- cpumask_set_cpu(cpu, cm);
- cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (cpumask_empty(cm)) {
- cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (outgoingcpu >= 0)
- cpumask_clear_cpu(outgoingcpu, cm);
- }
-
- if (task_exp)
- set_cpus_allowed_ptr(task_exp, cm);
-
- if (task_boost)
- set_cpus_allowed_ptr(task_boost, cm);
-
- mutex_unlock(&rnp->kthread_mutex);
-
- free_cpumask_var(cm);
-}
-
-/*
* Has the specified (known valid) CPU ever been fully online?
*/
bool rcu_cpu_beenfullyonline(int cpu)
@@ -4607,7 +4265,6 @@ int rcutree_online_cpu(unsigned int cpu)
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return 0; /* Too early in boot for scheduler work. */
sync_sched_exp_online_cleanup(cpu);
- rcutree_affinity_setting(cpu, -1);
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -4644,7 +4301,7 @@ void rcutree_report_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
arch_spin_lock(&rcu_state.ofl_lock);
- rcu_dynticks_eqs_online();
+ rcu_watching_online();
raw_spin_lock(&rcu_state.barrier_lock);
raw_spin_lock_rcu_node(rnp);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
@@ -4656,7 +4313,7 @@ void rcutree_report_cpu_starting(unsigned int cpu)
ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
- rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
+ rdp->rcu_onl_gp_state = READ_ONCE(rcu_state.gp_state);
/* An incoming CPU should never be blocking a grace period. */
if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
@@ -4707,7 +4364,7 @@ void rcutree_report_cpu_dead(void)
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
- rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
+ rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state);
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
rcu_disable_urgency_upon_qs(rdp);
@@ -4734,11 +4391,15 @@ void rcutree_migrate_callbacks(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
bool needwake;
- if (rcu_rdp_is_offloaded(rdp) ||
- rcu_segcblist_empty(&rdp->cblist))
- return; /* No callbacks to migrate. */
+ if (rcu_rdp_is_offloaded(rdp))
+ return;
raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (rcu_segcblist_empty(&rdp->cblist)) {
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ return; /* No callbacks to migrate. */
+ }
+
WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
rcu_barrier_entrain(rdp);
my_rdp = this_cpu_ptr(&rcu_data);
@@ -4781,6 +4442,7 @@ void rcutree_migrate_callbacks(int cpu)
*/
int rcutree_dead_cpu(unsigned int cpu)
{
+ ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -4819,8 +4481,6 @@ int rcutree_offline_cpu(unsigned int cpu)
rnp->ffmask &= ~rdp->grpmask;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcutree_affinity_setting(cpu, cpu);
-
// nohz_full CPUs need the tick for stop-machine to work quickly
tick_dep_set(TICK_DEP_BIT_RCU);
return 0;
@@ -5005,6 +4665,8 @@ static void __init rcu_init_one(void)
while (i > rnp->grphi)
rnp++;
per_cpu_ptr(&rcu_data, i)->mynode = rnp;
+ per_cpu_ptr(&rcu_data, i)->barrier_head.next =
+ &per_cpu_ptr(&rcu_data, i)->barrier_head;
rcu_boot_init_percpu_data(i);
}
}
@@ -5083,8 +4745,7 @@ void rcu_init_geometry(void)
* Complain and fall back to the compile-time values if this
* limit is exceeded.
*/
- if (rcu_fanout_leaf < 2 ||
- rcu_fanout_leaf > sizeof(unsigned long) * 8) {
+ if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > BITS_PER_LONG) {
rcu_fanout_leaf = RCU_FANOUT_LEAF;
WARN_ON(1);
return;
@@ -5149,62 +4810,12 @@ static void __init rcu_dump_rcu_node_tree(void)
struct workqueue_struct *rcu_gp_wq;
-static void __init kfree_rcu_batch_init(void)
-{
- int cpu;
- int i, j;
- struct shrinker *kfree_rcu_shrinker;
-
- /* Clamp it to [0:100] seconds interval. */
- if (rcu_delay_page_cache_fill_msec < 0 ||
- rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
-
- rcu_delay_page_cache_fill_msec =
- clamp(rcu_delay_page_cache_fill_msec, 0,
- (int) (100 * MSEC_PER_SEC));
-
- pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
- rcu_delay_page_cache_fill_msec);
- }
-
- for_each_possible_cpu(cpu) {
- struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
- for (i = 0; i < KFREE_N_BATCHES; i++) {
- INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
- krcp->krw_arr[i].krcp = krcp;
-
- for (j = 0; j < FREE_N_CHANNELS; j++)
- INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
- }
-
- for (i = 0; i < FREE_N_CHANNELS; i++)
- INIT_LIST_HEAD(&krcp->bulk_head[i]);
-
- INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
- INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
- krcp->initialized = true;
- }
-
- kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree");
- if (!kfree_rcu_shrinker) {
- pr_err("Failed to allocate kfree_rcu() shrinker!\n");
- return;
- }
-
- kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
- kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
-
- shrinker_register(kfree_rcu_shrinker);
-}
-
void __init rcu_init(void)
{
int cpu = smp_processor_id();
rcu_early_boot_tests();
- kfree_rcu_batch_init();
rcu_bootup_announce();
sanitize_kthread_prio();
rcu_init_geometry();
@@ -5229,6 +4840,9 @@ void __init rcu_init(void)
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
+ sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!sync_wq);
+
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
if (qovld < 0)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index df48160b3136..3830c19cf2f6 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -168,7 +168,7 @@ struct rcu_snap_record {
u64 cputime_irq; /* Accumulated cputime of hard irqs */
u64 cputime_softirq;/* Accumulated cputime of soft irqs */
u64 cputime_system; /* Accumulated cputime of kernel tasks */
- unsigned long nr_hardirqs; /* Accumulated number of hard irqs */
+ u64 nr_hardirqs; /* Accumulated number of hard irqs */
unsigned int nr_softirqs; /* Accumulated number of soft irqs */
unsigned long long nr_csw; /* Accumulated number of task switches */
unsigned long jiffies; /* Track jiffies value */
@@ -183,6 +183,7 @@ struct rcu_data {
bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
+ unsigned int gpwrap_count; /* Count of GP sequence wrap. */
bool cpu_started; /* RCU watching this onlining CPU. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
@@ -206,7 +207,7 @@ struct rcu_data {
long blimit; /* Upper limit on a processed batch */
/* 3) dynticks interface. */
- int dynticks_snap; /* Per-GP tracking for dynticks. */
+ int watching_snap; /* Per-GP tracking for dynticks. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
@@ -215,7 +216,7 @@ struct rcu_data {
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */
struct rcu_head barrier_head;
- int exp_dynticks_snap; /* Double-check need for IPI. */
+ int exp_watching_snap; /* Double-check need for IPI. */
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -223,7 +224,6 @@ struct rcu_data {
struct swait_queue_head nocb_state_wq; /* For offloading state changes */
struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
- atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
@@ -273,9 +273,9 @@ struct rcu_data {
bool rcu_iw_pending; /* Is ->rcu_iw pending? */
unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */
unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */
- short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */
+ short rcu_ofl_gp_state; /* ->gp_state at last offline. */
unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
- short rcu_onl_gp_flags; /* ->gp_flags at last online. */
+ short rcu_onl_gp_state; /* ->gp_state at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
@@ -316,6 +316,19 @@ do { \
} while (0)
/*
+ * A max threshold for synchronize_rcu() users which are
+ * awaken directly by the rcu_gp_kthread(). Left part is
+ * deferred to the main worker.
+ */
+#define SR_MAX_USERS_WAKE_FROM_GP 5
+#define SR_NORMAL_GP_WAIT_HEAD_MAX 5
+
+struct sr_wait_node {
+ atomic_t inuse;
+ struct llist_node node;
+};
+
+/*
* RCU global state, including node hierarchy. This hierarchy is
* represented in "heap" form in a dense array. The root (first level)
* of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
@@ -399,7 +412,19 @@ struct rcu_state {
arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
/* Synchronize offline with */
/* GP pre-initialization. */
+
+ /* synchronize_rcu() part. */
+ struct llist_head srs_next; /* request a GP users. */
+ struct llist_node *srs_wait_tail; /* wait for GP users. */
+ struct llist_node *srs_done_tail; /* ready for GP users. */
+ struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
+ struct work_struct srs_cleanup_work;
+ atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+ struct mutex nocb_mutex; /* Guards (de-)offloading */
int nocb_is_setup; /* nocb is setup from boot */
+#endif
};
/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6b83537480b1..c36c7d5575ca 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -7,6 +7,7 @@
* Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/console.h>
#include <linux/lockdep.h>
static void rcu_exp_handler(void *unused);
@@ -199,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake)
- swake_up_one_online(&rcu_state.expedited_wq);
+ swake_up_one(&rcu_state.expedited_wq);
break;
}
@@ -226,20 +227,22 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
/*
* Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure.
+ * specified leaf rcu_node structure, which is acquired by the caller.
*/
-static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
- unsigned long mask, bool wake)
+static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags,
+ unsigned long mask_in, bool wake)
+ __releases(rnp->lock)
{
int cpu;
- unsigned long flags;
+ unsigned long mask;
struct rcu_data *rdp;
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (!(rnp->expmask & mask)) {
+ raw_lockdep_assert_held_rcu_node(rnp);
+ if (!(rnp->expmask & mask_in)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
+ mask = mask_in & rnp->expmask;
WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -256,8 +259,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
*/
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
+ unsigned long flags;
+ struct rcu_node *rnp = rdp->mynode;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rdp->cpu_no_qs.b.exp, false);
- rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
+ ASSERT_EXCLUSIVE_WRITER(rdp->cpu_no_qs.b.exp);
+ rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true);
}
/* Common code for work-done checking. */
@@ -265,7 +273,12 @@ static bool sync_exp_work_done(unsigned long s)
{
if (rcu_exp_gp_seq_done(s)) {
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
- smp_mb(); /* Ensure test happens before caller kfree(). */
+ /*
+ * Order GP completion with preceding accesses. Order also GP
+ * completion with post GP update side accesses. Pairs with
+ * rcu_seq_end().
+ */
+ smp_mb();
return true;
}
return false;
@@ -357,11 +370,25 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
!(rnp->qsmaskinitnext & mask)) {
mask_ofl_test |= mask;
} else {
- snap = rcu_dynticks_snap(cpu);
- if (rcu_dynticks_in_eqs(snap))
+ /*
+ * Full ordering between remote CPU's post idle accesses
+ * and updater's accesses prior to current GP (and also
+ * the started GP sequence number) is enforced by
+ * rcu_seq_start() implicit barrier, relayed by kworkers
+ * locking and even further by smp_mb__after_unlock_lock()
+ * barriers chained all the way throughout the rnp locking
+ * tree since sync_exp_reset_tree() and up to the current
+ * leaf rnp locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and
+ * post grace period updater's accesses is enforced by the
+ * below acquire semantic.
+ */
+ snap = ct_rcu_watching_cpu_acquire(cpu);
+ if (rcu_watching_snap_in_eqs(snap))
mask_ofl_test |= mask;
else
- rdp->exp_dynticks_snap = snap;
+ rdp->exp_watching_snap = snap;
}
}
mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
@@ -381,7 +408,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
unsigned long mask = rdp->grpmask;
retry_ipi:
- if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) {
+ if (rcu_watching_snap_stopped_since(rdp, rdp->exp_watching_snap)) {
mask_ofl_test |= mask;
continue;
}
@@ -412,8 +439,10 @@ retry_ipi:
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/* Report quiescent states for those that went offline. */
- if (mask_ofl_test)
- rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
+ if (mask_ofl_test) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rcu_report_exp_cpu_mult(rnp, flags, mask_ofl_test, false);
+ }
}
static void rcu_exp_sel_wait_wake(unsigned long s);
@@ -524,6 +553,67 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit)
}
/*
+ * Print out an expedited RCU CPU stall warning message.
+ */
+static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j)
+{
+ int cpu;
+ unsigned long mask;
+ int ndetected;
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_root = rcu_get_root();
+
+ if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
+ pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
+ return;
+ }
+ pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name);
+ ndetected = 0;
+ rcu_for_each_leaf_node(rnp) {
+ ndetected += rcu_print_task_exp_stall(rnp);
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ struct rcu_data *rdp;
+
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(READ_ONCE(rnp->expmask) & mask))
+ continue;
+ ndetected++;
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ pr_cont(" %d-%c%c%c%c", cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rnp->expmaskinit)],
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+ "D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
+ }
+ }
+ pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
+ j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask),
+ ".T"[!!data_race(rnp_root->exp_tasks)]);
+ if (ndetected) {
+ pr_err("blocking rcu_node structures (internal RCU debug):");
+ rcu_for_each_node_breadth_first(rnp) {
+ if (rnp == rnp_root)
+ continue; /* printed unconditionally */
+ if (sync_rcu_exp_done_unlocked(rnp))
+ continue;
+ pr_cont(" l=%u:%d-%d:%#lx/%c",
+ rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask),
+ ".T"[!!data_race(rnp->exp_tasks)]);
+ }
+ pr_cont("\n");
+ }
+ rcu_for_each_leaf_node(rnp) {
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ mask = leaf_node_cpu_bit(rnp, cpu);
+ if (!(READ_ONCE(rnp->expmask) & mask))
+ continue;
+ dump_cpu_task(cpu);
+ }
+ rcu_exp_print_detail_task_stall_rnp(rnp);
+ }
+}
+
+/*
* Wait for the expedited grace period to elapse, issuing any needed
* RCU CPU stall warnings along the way.
*/
@@ -534,10 +624,8 @@ static void synchronize_rcu_expedited_wait(void)
unsigned long jiffies_stall;
unsigned long jiffies_start;
unsigned long mask;
- int ndetected;
struct rcu_data *rdp;
struct rcu_node *rnp;
- struct rcu_node *rnp_root = rcu_get_root();
unsigned long flags;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
@@ -571,59 +659,17 @@ static void synchronize_rcu_expedited_wait(void)
return;
if (rcu_stall_is_suppressed())
continue;
+
+ nbcon_cpu_emergency_enter();
+
j = jiffies;
rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
- pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
- rcu_state.name);
- ndetected = 0;
- rcu_for_each_leaf_node(rnp) {
- ndetected += rcu_print_task_exp_stall(rnp);
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- struct rcu_data *rdp;
-
- mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(READ_ONCE(rnp->expmask) & mask))
- continue;
- ndetected++;
- rdp = per_cpu_ptr(&rcu_data, cpu);
- pr_cont(" %d-%c%c%c%c", cpu,
- "O."[!!cpu_online(cpu)],
- "o."[!!(rdp->grpmask & rnp->expmaskinit)],
- "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
- "D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
- }
- }
- pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
- j - jiffies_start, rcu_state.expedited_sequence,
- data_race(rnp_root->expmask),
- ".T"[!!data_race(rnp_root->exp_tasks)]);
- if (ndetected) {
- pr_err("blocking rcu_node structures (internal RCU debug):");
- rcu_for_each_node_breadth_first(rnp) {
- if (rnp == rnp_root)
- continue; /* printed unconditionally */
- if (sync_rcu_exp_done_unlocked(rnp))
- continue;
- pr_cont(" l=%u:%d-%d:%#lx/%c",
- rnp->level, rnp->grplo, rnp->grphi,
- data_race(rnp->expmask),
- ".T"[!!data_race(rnp->exp_tasks)]);
- }
- pr_cont("\n");
- }
- rcu_for_each_leaf_node(rnp) {
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- mask = leaf_node_cpu_bit(rnp, cpu);
- if (!(READ_ONCE(rnp->expmask) & mask))
- continue;
- preempt_disable(); // For smp_processor_id() in dump_cpu_task().
- dump_cpu_task(cpu);
- preempt_enable();
- }
- rcu_exp_print_detail_task_stall_rnp(rnp);
- }
+ synchronize_rcu_expedited_stall(jiffies_start, j);
jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
+
+ nbcon_cpu_emergency_exit();
+
panic_on_rcu_stall();
}
}
@@ -675,6 +721,18 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
rcu_exp_wait_wake(s);
}
+/* Request an expedited quiescent state. */
+static void rcu_exp_need_qs(void)
+{
+ lockdep_assert_irqs_disabled();
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(*this_cpu_ptr(&rcu_data.cpu_no_qs.b.exp));
+ __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
+ /* Store .exp before .rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -693,24 +751,34 @@ static void rcu_exp_handler(void *unused)
struct task_struct *t = current;
/*
- * First, the common case of not being in an RCU read-side
+ * First, is there no need for a quiescent state from this CPU,
+ * or is this CPU already looking for a quiescent state for the
+ * current grace period? If either is the case, just leave.
+ * However, this should not happen due to the preemptible
+ * sync_sched_exp_online_cleanup() implementation being a no-op,
+ * so warn if this does happen.
+ */
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
+ if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+ READ_ONCE(rdp->cpu_no_qs.b.exp)))
+ return;
+
+ /*
+ * Second, the common case of not being in an RCU read-side
* critical section. If also enabled or idle, immediately
* report the quiescent state, otherwise defer.
*/
if (!depth) {
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
- rcu_is_cpu_rrupt_from_idle()) {
+ rcu_is_cpu_rrupt_from_idle())
rcu_report_exp_rdp(rdp);
- } else {
- WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
- set_tsk_need_resched(t);
- set_preempt_need_resched();
- }
+ else
+ rcu_exp_need_qs();
return;
}
/*
- * Second, the less-common case of being in an RCU read-side
+ * Third, the less-common case of being in an RCU read-side
* critical section. In this case we can count on a future
* rcu_read_unlock(). However, this rcu_read_unlock() might
* execute on some other CPU, but in that case there will be
@@ -731,7 +799,7 @@ static void rcu_exp_handler(void *unused)
return;
}
- // Finally, negative nesting depth should not happen.
+ // Fourth and finally, negative nesting depth should not happen.
WARN_ON_ONCE(1);
}
@@ -798,16 +866,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
#else /* #ifdef CONFIG_PREEMPT_RCU */
-/* Request an expedited quiescent state. */
-static void rcu_exp_need_qs(void)
-{
- __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
- /* Store .exp before .rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
- set_tsk_need_resched(current);
- set_preempt_need_resched();
-}
-
/* Invoked on each online non-idle CPU for expedited quiescent state. */
static void rcu_exp_handler(void *unused)
{
@@ -815,6 +873,7 @@ static void rcu_exp_handler(void *unused)
struct rcu_node *rnp = rdp->mynode;
bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
return;
@@ -930,7 +989,7 @@ void synchronize_rcu_expedited(void)
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu_hurry);
+ synchronize_rcu_normal();
return;
}
@@ -953,7 +1012,6 @@ void synchronize_rcu_expedited(void)
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
- smp_mb(); /* Work actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 3f85577bddd4..1596812f7f12 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -16,10 +16,6 @@
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return lockdep_is_held(&rdp->nocb_lock);
-}
static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
{
@@ -91,8 +87,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0);
/*
* Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
+ * lock isn't immediately available, perform minimal sanity check.
*/
static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
__acquires(&rdp->nocb_bypass_lock)
@@ -100,29 +95,12 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
lockdep_assert_irqs_disabled();
if (raw_spin_trylock(&rdp->nocb_bypass_lock))
return;
- atomic_inc(&rdp->nocb_lock_contended);
+ /*
+ * Contention expected only when local enqueue collide with
+ * remote flush from kthreads.
+ */
WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
}
/*
@@ -228,7 +206,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&rdp_gp->nocb_timer);
+ timer_delete(&rdp_gp->nocb_timer);
}
if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
@@ -238,7 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
if (needwake) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- wake_up_process(rdp_gp->nocb_gp_kthread);
+ swake_up_one(&rdp_gp->nocb_gp_wq);
}
return needwake;
@@ -431,14 +409,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
return false;
}
- // In the process of (de-)offloading: no bypassing, but
- // locking.
- if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false; /* Not offloaded, no bypassing. */
- }
-
// Don't use ->nocb_bypass during early boot.
if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
rcu_nocb_lock(rdp);
@@ -510,7 +480,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
}
// We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
rcu_nocb_bypass_lock(rdp);
ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
@@ -524,7 +493,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
}
rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
+
// A wake up of the grace period kthread or timer adjustment
// needs to be done only if:
// 1. Bypass list was fully empty before (this is the first
@@ -635,42 +604,33 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
}
}
-static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
- bool *wake_state)
+static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp)
{
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
- int ret;
- rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
- !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * Locking orders future de-offloaded callbacks enqueue against previous
+ * handling of this rdp. Ie: Make sure rcuog is done with this rdp before
+ * deoffloaded callbacks can be enqueued.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
/*
* Offloading. Set our flag and notify the offload worker.
* We will handle this rdp until it ever gets de-offloaded.
*/
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *wake_state = true;
- ret = 1;
- } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED);
+ } else {
/*
* De-offloading. Clear our flag and notify the de-offload worker.
* We will ignore this rdp until it ever gets re-offloaded.
*/
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *wake_state = true;
- ret = 0;
- } else {
- WARN_ON_ONCE(1);
- ret = -1;
+ list_del(&rdp->nocb_entry_rdp);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED);
}
-
- rcu_nocb_unlock_irqrestore(rdp, flags);
-
- return ret;
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
@@ -856,7 +816,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
+ timer_delete(&my_rdp->nocb_timer);
}
WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
@@ -877,16 +837,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (rdp_toggling) {
- bool wake_state = false;
- int ret;
-
- ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
- if (ret == 1)
- list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
- else if (ret == 0)
- list_del(&rdp_toggling->nocb_entry_rdp);
- if (wake_state)
- swake_up_one(&rdp_toggling->nocb_state_wq);
+ nocb_gp_toggle_rdp(my_rdp, rdp_toggling);
+ swake_up_one(&rdp_toggling->nocb_state_wq);
}
my_rdp->nocb_gp_seq = -1;
@@ -913,16 +865,9 @@ static int rcu_nocb_gp_kthread(void *arg)
return 0;
}
-static inline bool nocb_cb_can_run(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
{
- return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+ return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park();
}
/*
@@ -934,24 +879,33 @@ static void nocb_cb_wait(struct rcu_data *rdp)
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long cur_gp_seq;
unsigned long flags;
- bool needwake_state = false;
bool needwake_gp = false;
- bool can_sleep = true;
struct rcu_node *rnp = rdp->mynode;
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- if (READ_ONCE(rdp->nocb_cb_sleep)) {
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+ if (kthread_should_park()) {
+ /*
+ * kthread_park() must be preceded by an rcu_barrier().
+ * But yet another rcu_barrier() might have sneaked in between
+ * the barrier callback execution and the callbacks counter
+ * decrement.
+ */
+ if (rdp->nocb_cb_sleep) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ kthread_parkme();
}
- } while (!nocb_cb_can_run(rdp));
+ } else if (READ_ONCE(rdp->nocb_cb_sleep)) {
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
local_irq_save(flags);
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
local_irq_restore(flags);
/*
* Disable BH to provide the expected environment. Also, when
@@ -971,37 +925,16 @@ static void nocb_cb_wait(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
}
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
- if (rcu_segcblist_ready_cbs(cblist))
- can_sleep = false;
+ if (!rcu_segcblist_ready_cbs(cblist)) {
+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
} else {
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We won't touch the callbacks and keep sleeping until we ever
- * get re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
}
- WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
-
- if (rdp->nocb_cb_sleep)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
-
rcu_nocb_unlock_irqrestore(rdp, flags);
if (needwake_gp)
rcu_gp_kthread_wake();
-
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
}
/*
@@ -1085,25 +1018,11 @@ void rcu_nocb_flush_deferred_wakeup(void)
}
EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
-static int rdp_offload_toggle(struct rcu_data *rdp,
- bool offload, unsigned long flags)
- __releases(rdp->nocb_lock)
+static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp)
{
- struct rcu_segcblist *cblist = &rdp->cblist;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
bool wake_gp = false;
-
- rcu_segcblist_offload(cblist, offload);
-
- if (rdp->nocb_cb_sleep)
- rdp->nocb_cb_sleep = false;
- rcu_nocb_unlock_irqrestore(rdp, flags);
-
- /*
- * Ignore former value of nocb_cb_sleep and force wake up as it could
- * have been spuriously set to false already.
- */
- swake_up_one(&rdp->nocb_cb_wq);
+ unsigned long flags;
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
// Queue this rdp for add/del to/from the list to iterate on rcuog
@@ -1117,97 +1036,73 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
return wake_gp;
}
-static long rcu_nocb_rdp_deoffload(void *arg)
+static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp)
{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
- int wake_gp;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool ret;
/*
- * rcu_nocb_rdp_deoffload() may be called directly if
- * rcuog/o[p] spawn failed, because at this time the rdp->cpu
- * is not online yet.
+ * Locking makes sure rcuog is done handling this rdp before deoffloaded
+ * enqueue can happen. Also it keeps the SEGCBLIST_OFFLOADED flag stable
+ * while the ->nocb_lock is held.
*/
- WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ /* CPU must be offline, unless it's early boot */
+ WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id());
pr_info("De-offloading %d\n", rdp->cpu);
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Flush once and for all now. This suffices because we are
- * running on the target CPU holding ->nocb_lock (thus having
- * interrupts disabled), and because rdp_offload_toggle()
- * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
- * Thus future calls to rcu_segcblist_completely_offloaded() will
- * return false, which means that future calls to rcu_nocb_try_bypass()
- * will refuse to put anything into the bypass.
- */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ /* Flush all callbacks from segcblist and bypass */
+ rcu_barrier();
+
/*
- * Start with invoking rcu_core() early. This way if the current thread
- * happens to preempt an ongoing call to rcu_core() in the middle,
- * leaving some work dismissed because rcu_core() still thinks the rdp is
- * completely offloaded, we are guaranteed a nearby future instance of
- * rcu_core() to catch up.
+ * Make sure the rcuoc kthread isn't in the middle of a nocb locked
+ * sequence while offloading is deactivated, along with nocb locking.
*/
- rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
- invoke_rcu_core();
- wake_gp = rdp_offload_toggle(rdp, false, flags);
+ if (rdp->nocb_cb_kthread)
+ kthread_park(rdp->nocb_cb_kthread);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+
if (rdp_gp->nocb_gp_kthread) {
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
- /*
- * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB.
- * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
- */
- if (!rdp->nocb_cb_kthread) {
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
-
swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist,
- SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+ rcu_nocb_rdp_deoffload_wait_cond(rdp));
} else {
/*
* No kthread to clear the flags for us or remove the rdp from the nocb list
* to iterate. Do it here instead. Locking doesn't look stricly necessary
* but we stick to paranoia in this rare path.
*/
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(&rdp->cblist,
- SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
list_del(&rdp->nocb_entry_rdp);
}
- mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
-
- /*
- * Lock one last time to acquire latest callback updates from kthreads
- * so we can later handle callbacks locally without locking.
- */
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Theoretically we could clear SEGCBLIST_LOCKING after the nocb
- * lock is released but how about being paranoid for once?
- */
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING);
- /*
- * Without SEGCBLIST_LOCKING, we can't use
- * rcu_nocb_unlock_irqrestore() anymore.
- */
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-
- /* Sanity check */
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
return 0;
}
@@ -1218,33 +1113,42 @@ int rcu_nocb_cpu_deoffload(int cpu)
int ret = 0;
cpus_read_lock();
- mutex_lock(&rcu_state.barrier_mutex);
+ mutex_lock(&rcu_state.nocb_mutex);
if (rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ if (!cpu_online(cpu)) {
+ ret = rcu_nocb_rdp_deoffload(rdp);
if (!ret)
cpumask_clear_cpu(cpu, rcu_nocb_mask);
} else {
- pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu);
+ pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu);
ret = -EINVAL;
}
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
-static long rcu_nocb_rdp_offload(void *arg)
+static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
+ bool ret;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
+{
int wake_gp;
struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ WARN_ON_ONCE(cpu_online(rdp->cpu));
/*
* For now we only support re-offload, ie: the rdp must have been
* offloaded on boot first.
@@ -1257,42 +1161,17 @@ static long rcu_nocb_rdp_offload(void *arg)
pr_info("Offloading %d\n", rdp->cpu);
- /*
- * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
- * is set.
- */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
- /*
- * We didn't take the nocb lock while working on the
- * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode).
- * Every modifications that have been done previously on
- * rdp->cblist must be visible remotely by the nocb kthreads
- * upon wake up after reading the cblist flags.
- *
- * The layout against nocb_lock enforces that ordering:
- *
- * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
- * ------------------------- ----------------------------
- * WRITE callbacks rcu_nocb_lock()
- * rcu_nocb_lock() READ flags
- * WRITE flags READ callbacks
- * rcu_nocb_unlock() rcu_nocb_unlock()
- */
- wake_gp = rdp_offload_toggle(rdp, true, flags);
+ wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
+
swait_event_exclusive(rdp->nocb_state_wq,
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ rcu_nocb_rdp_offload_wait_cond(rdp));
- /*
- * All kthreads are ready to work, we can finally relieve rcu_core() and
- * enable nocb bypass.
- */
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
- rcu_nocb_unlock_irqrestore(rdp, flags);
+ kthread_unpark(rdp->nocb_cb_kthread);
return 0;
}
@@ -1303,18 +1182,18 @@ int rcu_nocb_cpu_offload(int cpu)
int ret = 0;
cpus_read_lock();
- mutex_lock(&rcu_state.barrier_mutex);
+ mutex_lock(&rcu_state.nocb_mutex);
if (!rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ if (!cpu_online(cpu)) {
+ ret = rcu_nocb_rdp_offload(rdp);
if (!ret)
cpumask_set_cpu(cpu, rcu_nocb_mask);
} else {
- pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu);
+ pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu);
ret = -EINVAL;
}
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
cpus_read_unlock();
return ret;
@@ -1332,7 +1211,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
return 0;
/* Protect rcu_nocb_mask against concurrent (de-)offloading. */
- if (!mutex_trylock(&rcu_state.barrier_mutex))
+ if (!mutex_trylock(&rcu_state.nocb_mutex))
return 0;
/* Snapshot count of all CPUs */
@@ -1342,7 +1221,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
count += READ_ONCE(rdp->lazy_len);
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
return count ? count : SHRINK_EMPTY;
}
@@ -1360,9 +1239,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
* Protect against concurrent (de-)offloading. Otherwise nocb locking
* may be ignored or imbalanced.
*/
- if (!mutex_trylock(&rcu_state.barrier_mutex)) {
+ if (!mutex_trylock(&rcu_state.nocb_mutex)) {
/*
- * But really don't insist if barrier_mutex is contended since we
+ * But really don't insist if nocb_mutex is contended since we
* can't guarantee that it will never engage in a dependency
* chain involving memory allocation. The lock is seldom contended
* anyway.
@@ -1401,7 +1280,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
break;
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
return count ? count : SHRINK_STOP;
}
@@ -1467,9 +1346,7 @@ void __init rcu_init_nohz(void)
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
- rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
}
rcu_organize_nocb_kthreads();
}
@@ -1517,7 +1394,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
"rcuog/%d", rdp_gp->cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
- goto end;
+ goto err;
}
WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
if (kthread_prio)
@@ -1526,10 +1403,15 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
/* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
+ t = kthread_create(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- goto end;
+ goto err;
+
+ if (rcu_rdp_is_offloaded(rdp))
+ wake_up_process(t);
+ else
+ kthread_park(t);
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
@@ -1537,13 +1419,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
WRITE_ONCE(rdp->nocb_cb_kthread, t);
WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
return;
-end:
- mutex_lock(&rcu_state.barrier_mutex);
+
+err:
+ /*
+ * No need to protect against concurrent rcu_barrier()
+ * because the number of callbacks should be 0 for a non-boot CPU,
+ * therefore rcu_barrier() shouldn't even try to grab the nocb_lock.
+ * But hold nocb_mutex to avoid nocb_lock imbalance from shrinker.
+ */
+ WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist));
+ mutex_lock(&rcu_state.nocb_mutex);
if (rcu_rdp_is_offloaded(rdp)) {
rcu_nocb_rdp_deoffload(rdp);
cpumask_clear_cpu(cpu, rcu_nocb_mask);
}
- mutex_unlock(&rcu_state.barrier_mutex);
+ mutex_unlock(&rcu_state.nocb_mutex);
}
/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
@@ -1661,8 +1551,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
/* Dump out nocb kthread state for the specified rcu_data structure. */
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
- char bufw[20];
- char bufr[20];
+ char bufd[22];
+ char bufw[45];
+ char bufr[45];
+ char bufn[22];
+ char bufb[22];
struct rcu_data *nocb_next_rdp;
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
@@ -1676,14 +1569,17 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
typeof(*rdp),
nocb_entry_rdp);
- sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
- sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]);
+ sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
+ rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]);
+ sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
nocb_next_rdp ? nocb_next_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
"lL"[raw_spin_is_locked(&rdp->nocb_lock)],
"sS"[!!rdp->nocb_cb_sleep],
".W"[swait_active(&rdp->nocb_cb_wq)],
@@ -1691,12 +1587,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
jiffies - rdp->nocb_nobypass_last,
rdp->nocb_nobypass_count,
".D"[rcu_segcblist_ready_cbs(rsclp)],
+ rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd,
".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn,
".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb,
rcu_segcblist_n_cbs(&rdp->cblist),
rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
@@ -1720,16 +1619,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return 0;
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- return false;
-}
-
/* No ->nocb_lock to acquire. */
static void rcu_nocb_lock(struct rcu_data *rdp)
{
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 36a8b5dbf5b5..0b0f56f6abc8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -24,12 +24,13 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
* timers have their own means of synchronization against the
* offloaded state updaters.
*/
- RCU_LOCKDEP_WARN(
+ RCU_NOCB_LOCKDEP_WARN(
!(lockdep_is_held(&rcu_state.barrier_mutex) ||
(IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
- rcu_lockdep_is_held_nocb(rdp) ||
- (rdp == this_cpu_ptr(&rcu_data) &&
- !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
+ lockdep_is_held(&rdp->nocb_lock) ||
+ lockdep_is_held(&rcu_state.nocb_mutex) ||
+ ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) &&
+ rdp == this_cpu_ptr(&rcu_data)) ||
rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
);
@@ -93,6 +94,16 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (nohz_full_patience_delay < 0) {
+ pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay);
+ nohz_full_patience_delay = 0;
+ } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) {
+ pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC);
+ nohz_full_patience_delay = 5 * MSEC_PER_SEC;
+ } else if (nohz_full_patience_delay) {
+ pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay);
+ }
+ nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay);
if (!use_softirq)
pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
@@ -172,9 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
switch (blkd_state) {
case 0:
case RCU_EXP_TASKS:
- case RCU_EXP_TASKS + RCU_GP_BLKD:
+ case RCU_EXP_TASKS | RCU_GP_BLKD:
case RCU_GP_TASKS:
- case RCU_GP_TASKS + RCU_EXP_TASKS:
+ case RCU_GP_TASKS | RCU_EXP_TASKS:
/*
* Blocking neither GP, or first task blocking the normal
@@ -187,10 +198,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
case RCU_EXP_BLKD:
case RCU_GP_BLKD:
- case RCU_GP_BLKD + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
+ case RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
/*
* First task arriving that blocks either GP, or first task
@@ -203,9 +214,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
break;
- case RCU_EXP_TASKS + RCU_EXP_BLKD:
- case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
+ case RCU_EXP_TASKS | RCU_EXP_BLKD:
+ case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD:
/*
* Second or subsequent task blocking the expedited GP.
@@ -216,8 +227,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
list_add(&t->rcu_node_entry, rnp->exp_tasks);
break;
- case RCU_GP_TASKS + RCU_GP_BLKD:
- case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
+ case RCU_GP_TASKS | RCU_GP_BLKD:
+ case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD:
/*
* Second or subsequent task blocking the normal GP.
@@ -264,6 +275,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
rcu_report_exp_rdp(rdp);
else
WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
}
/*
@@ -805,8 +817,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
rdp = per_cpu_ptr(&rcu_data, cpu);
pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
cpu, ".o"[rcu_rdp_cpu_online(rdp)],
- (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
- (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
+ (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+ (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
}
}
@@ -821,8 +833,17 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread)
return;
+
+ /*
+ * rcu_report_qs_rdp() can only be invoked with a stable rdp and
+ * from the local CPU.
+ *
+ * The in_atomic_preempt_off() check ensures that we come here holding
+ * the last preempt_count (which will get dropped once we return to
+ * __rcu_read_unlock().
+ */
rdp = this_cpu_ptr(&rcu_data);
rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
@@ -859,7 +880,7 @@ static void rcu_qs(void)
/*
* Register an urgently needed quiescent state. If there is an
- * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
+ * emergency, invoke rcu_momentary_eqs() to do a heavy-weight
* dyntick-idle quiescent state visible to other CPUs, which will in
* some cases serve for expedited as well as normal grace periods.
* Either way, register a lightweight quiescent state.
@@ -879,7 +900,7 @@ void rcu_all_qs(void)
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
local_irq_save(flags);
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
local_irq_restore(flags);
}
rcu_qs();
@@ -899,7 +920,7 @@ void rcu_note_context_switch(bool preempt)
goto out;
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
- rcu_momentary_dyntick_idle();
+ rcu_momentary_eqs();
out:
rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
@@ -963,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
*/
static void rcu_flavor_sched_clock_irq(int user)
{
- if (user || rcu_is_cpu_rrupt_from_idle()) {
+ if (user || rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) &&
+ (preempt_count() == HARDIRQ_OFFSET))) {
/*
* Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
+ * mode, from the idle loop without this being a nested
+ * interrupt, or while not holding the task preempt count
+ * (with PREEMPT_COUNT=y). In this case, the CPU is in a
+ * quiescent state, so note it.
*
* No memory barrier is required here because rcu_qs()
* references only CPU-local variables that other CPUs
@@ -1206,16 +1230,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ rcu_thread_affine_rnp(t, rnp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
}
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
-{
- return READ_ONCE(rnp->boost_kthread_task);
-}
-
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1232,10 +1253,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
-{
- return NULL;
-}
#endif /* #else #ifdef CONFIG_RCU_BOOST */
/*
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5d666428546b..56b21219442b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -7,8 +7,10 @@
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/console.h>
#include <linux/kvm_para.h>
#include <linux/rcu_notifier.h>
+#include <linux/smp.h>
//////////////////////////////////////////////////////////////////////////////
//
@@ -74,36 +76,6 @@ int rcu_jiffies_till_stall_check(void)
}
EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
-/**
- * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled?
- *
- * Returns @true if the current grace period is sufficiently old that
- * it is reasonable to assume that it might be stalled. This can be
- * useful when deciding whether to allocate memory to enable RCU-mediated
- * freeing on the one hand or just invoking synchronize_rcu() on the other.
- * The latter is preferable when the grace period is stalled.
- *
- * Note that sampling of the .gp_start and .gp_seq fields must be done
- * carefully to avoid false positives at the beginnings and ends of
- * grace periods.
- */
-bool rcu_gp_might_be_stalled(void)
-{
- unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV;
- unsigned long j = jiffies;
-
- if (d < RCU_STALL_MIGHT_MIN)
- d = RCU_STALL_MIGHT_MIN;
- smp_mb(); // jiffies before .gp_seq to avoid false positives.
- if (!rcu_gp_in_progress())
- return false;
- // Long delays at this point avoids false positive, but a delay
- // of ULONG_MAX/4 jiffies voids your no-false-positive warranty.
- smp_mb(); // .gp_seq before second .gp_start
- // And ditto here.
- return !time_before(j, READ_ONCE(rcu_state.gp_start) + d);
-}
-
/* Don't do RCU CPU stall warnings during long sysrq printouts. */
void rcu_sysrq_start(void)
{
@@ -363,22 +335,32 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
* that don't support NMI-based stack dumps. The NMI-triggered stack
* traces are more accurate because they are printed by the target CPU.
*/
-static void rcu_dump_cpu_stacks(void)
+static void rcu_dump_cpu_stacks(unsigned long gp_seq)
{
int cpu;
unsigned long flags;
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- for_each_leaf_node_possible_cpu(rnp, cpu)
+ printk_deferred_enter();
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (gp_seq != data_race(rcu_state.gp_seq)) {
+ printk_deferred_exit();
+ pr_err("INFO: Stall ended during stack backtracing.\n");
+ return;
+ }
+ if (!(data_race(rnp->qsmask) & leaf_node_cpu_bit(rnp, cpu)))
+ continue;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
if (cpu_is_offline(cpu))
pr_err("Offline CPU %d blocking current GP.\n", cpu);
else
dump_cpu_task(cpu);
}
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ printk_deferred_exit();
}
}
@@ -453,8 +435,8 @@ static void print_cpu_stat_info(int cpu)
rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
pr_err("\t hardirqs softirqs csw/system\n");
- pr_err("\t number: %8ld %10d %12lld\n",
- kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+ pr_err("\t number: %8lld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs,
kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
nr_context_switches_cpu(cpu) - rsrp->nr_csw);
pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
@@ -501,10 +483,11 @@ static void print_cpu_stall_info(int cpu)
}
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
- rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
+ rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu));
rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
if (rcuc_starved)
- sprintf(buf, " rcuc=%ld jiffies(starved)", j);
+ // Print signed value, as negative values indicate a probable bug.
+ snprintf(buf, sizeof(buf), " rcuc=%ld jiffies(starved)", j);
pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
cpu,
"O."[!!cpu_online(cpu)],
@@ -514,8 +497,8 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- rcu_dynticks_snap(cpu) & 0xffff,
- ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
+ ct_rcu_watching_cpu(cpu) & 0xffff,
+ ct_nesting_cpu(cpu), ct_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
rcuc_starved ? buf : "",
@@ -579,7 +562,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
rcu_state.name, (jiffies - jiffies_fqs),
(long)rcu_seq_current(&rcu_state.gp_seq),
- data_race(rcu_state.gp_flags),
+ data_race(READ_ONCE(rcu_state.gp_flags)), // Diagnostic read
gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
data_race(READ_ONCE(gpk->__state)));
pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
@@ -604,6 +587,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
if (rcu_stall_is_suppressed())
return;
+ nbcon_cpu_emergency_enter();
+
/*
* OK, time to rat on our buddy...
* See Documentation/RCU/stallwarn.rst for info on how to debug
@@ -628,9 +613,10 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
smp_processor_id(), (long)(jiffies - gps),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+ data_race(rcu_state.n_online_cpus)); // Diagnostic read
if (ndetected) {
- rcu_dump_cpu_stacks();
+ rcu_dump_cpu_stacks(gp_seq);
/* Complain about tasks blocking the grace period. */
rcu_for_each_leaf_node(rnp)
@@ -655,12 +641,14 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
+ nbcon_cpu_emergency_exit();
+
panic_on_rcu_stall();
rcu_force_quiescent_state(); /* Kick them all. */
}
-static void print_cpu_stall(unsigned long gps)
+static void print_cpu_stall(unsigned long gp_seq, unsigned long gps)
{
int cpu;
unsigned long flags;
@@ -675,6 +663,8 @@ static void print_cpu_stall(unsigned long gps)
if (rcu_stall_is_suppressed())
return;
+ nbcon_cpu_emergency_enter();
+
/*
* OK, time to rat on ourselves...
* See Documentation/RCU/stallwarn.rst for info on how to debug
@@ -689,12 +679,13 @@ static void print_cpu_stall(unsigned long gps)
totqlen += rcu_get_n_cbs_cpu(cpu);
pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
jiffies - gps,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+ data_race(rcu_state.n_online_cpus)); // Diagnostic read
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
- rcu_dump_cpu_stacks();
+ rcu_dump_cpu_stacks(gp_seq);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
/* Rewrite if needed in case of slow consoles. */
@@ -703,6 +694,8 @@ static void print_cpu_stall(unsigned long gps)
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ nbcon_cpu_emergency_exit();
+
panic_on_rcu_stall();
/*
@@ -716,6 +709,9 @@ static void print_cpu_stall(unsigned long gps)
set_preempt_need_resched();
}
+static bool csd_lock_suppress_rcu_stall;
+module_param(csd_lock_suppress_rcu_stall, bool, 0644);
+
static void check_cpu_stall(struct rcu_data *rdp)
{
bool self_detected;
@@ -771,7 +767,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
gs2 = READ_ONCE(rcu_state.gp_seq);
if (gs1 != gs2 ||
ULONG_CMP_LT(j, js) ||
- ULONG_CMP_GE(gps, js))
+ ULONG_CMP_GE(gps, js) ||
+ !rcu_seq_state(gs2))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
jn = jiffies + ULONG_MAX / 2;
@@ -788,9 +785,11 @@ static void check_cpu_stall(struct rcu_data *rdp)
return;
rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
- if (self_detected) {
+ if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
+ pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
+ } else if (self_detected) {
/* We haven't checked in, so go dump stack. */
- print_cpu_stall(gps);
+ print_cpu_stall(gs2, gps);
} else {
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2, gps);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 46aaaa9fe339..c912b594ba98 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -408,7 +408,7 @@ void wakeme_after_rcu(struct rcu_head *head)
}
EXPORT_SYMBOL_GPL(wakeme_after_rcu);
-void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array,
struct rcu_synchronize *rs_array)
{
int i;
@@ -440,7 +440,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
if (crcu_array[j] == crcu_array[i])
break;
if (j == i) {
- wait_for_completion(&rs_array[i].completion);
+ wait_for_completion_state(&rs_array[i].completion, state);
destroy_rcu_head_on_stack(&rs_array[i].head);
}
}
@@ -527,12 +527,12 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
/* Get rcutorture access to sched_setaffinity(). */
-long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn)
{
int ret;
ret = sched_setaffinity(pid, in_mask);
- WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
+ WARN_ONCE(dowarn && ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
return ret;
}
EXPORT_SYMBOL_GPL(torture_sched_setaffinity);