path: root/kernel/rcu
Diffstat (limited to 'kernel/rcu')
-rw-r--r--   kernel/rcu/Kconfig            170
-rw-r--r--   kernel/rcu/Kconfig.debug       65
-rw-r--r--   kernel/rcu/rcu.h              168
-rw-r--r--   kernel/rcu/rcu_segcblist.c    229
-rw-r--r--   kernel/rcu/rcu_segcblist.h     61
-rw-r--r--   kernel/rcu/rcuscale.c         412
-rw-r--r--   kernel/rcu/rcutorture.c      1896
-rw-r--r--   kernel/rcu/refscale.c         535
-rw-r--r--   kernel/rcu/srcutiny.c          94
-rw-r--r--   kernel/rcu/srcutree.c        1376
-rw-r--r--   kernel/rcu/sync.c               8
-rw-r--r--   kernel/rcu/tasks.h           1490
-rw-r--r--   kernel/rcu/tiny.c              89
-rw-r--r--   kernel/rcu/tree.c            2986
-rw-r--r--   kernel/rcu/tree.h             126
-rw-r--r--   kernel/rcu/tree_exp.h         404
-rw-r--r--   kernel/rcu/tree_nocb.h       1805
-rw-r--r--   kernel/rcu/tree_plugin.h     1606
-rw-r--r--   kernel/rcu/tree_stall.h       454
-rw-r--r--   kernel/rcu/update.c           120
20 files changed, 9811 insertions, 4283 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index cdc57b4f6d48..bdd7eadb33d8 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -8,6 +8,8 @@ menu "RCU Subsystem"
config TREE_RCU
bool
default y if SMP
+ # Dynticks-idle tracking
+ select CONTEXT_TRACKING_IDLE
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -51,56 +53,77 @@ config RCU_EXPERT
Say N if you are unsure.
-config SRCU
- bool
- help
- This option selects the sleepable version of RCU. This version
- permits arbitrary sleeping or blocking within RCU read-side critical
- sections.
-
config TINY_SRCU
bool
- default y if SRCU && TINY_RCU
+ default y if TINY_RCU
help
This option selects the single-CPU non-preemptible version of SRCU.
config TREE_SRCU
bool
- default y if SRCU && !TINY_RCU
+ default y if !TINY_RCU
help
This option selects the full-fledged version of SRCU.
+config NEED_SRCU_NMI_SAFE
+ def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
+
config TASKS_RCU_GENERIC
def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU
- select SRCU
help
This option enables generic infrastructure code supporting
task-based RCU implementations. Not for manual selection.
+config FORCE_TASKS_RCU
+ bool "Force selection of TASKS_RCU"
+ depends on RCU_EXPERT
+ select TASKS_RCU
+ default n
+ help
+ This option force-enables a task-based RCU implementation
+ that uses only voluntary context switch (not preemption!),
+ idle, and user-mode execution as quiescent states. Not for
+ manual selection in most cases.
+
config TASKS_RCU
- def_bool PREEMPTION
+ bool
+ default n
+ select IRQ_WORK
+
+config FORCE_TASKS_RUDE_RCU
+ bool "Force selection of Tasks Rude RCU"
+ depends on RCU_EXPERT
+ select TASKS_RUDE_RCU
+ default n
help
- This option enables a task-based RCU implementation that uses
- only voluntary context switch (not preemption!), idle, and
- user-mode execution as quiescent states. Not for manual selection.
+ This option force-enables a task-based RCU implementation
+ that uses only context switch (including preemption) and
+ user-mode execution as quiescent states. It forces IPIs and
+ context switches on all online CPUs, including idle ones,
+ so use with caution. Not for manual selection in most cases.
config TASKS_RUDE_RCU
- def_bool 0
- help
- This option enables a task-based RCU implementation that uses
- only context switch (including preemption) and user-mode
- execution as quiescent states. It forces IPIs and context
- switches on all online CPUs, including idle ones, so use
- with caution.
+ bool
+ default n
+ select IRQ_WORK
-config TASKS_TRACE_RCU
- def_bool 0
+config FORCE_TASKS_TRACE_RCU
+ bool "Force selection of Tasks Trace RCU"
+ depends on RCU_EXPERT
+ select TASKS_TRACE_RCU
+ default n
help
This option enables a task-based RCU implementation that uses
explicit rcu_read_lock_trace() read-side markers, and allows
- these readers to appear in the idle loop as well as on the CPU
- hotplug code paths. It can force IPIs on online CPUs, including
- idle ones, so use with caution.
+ these readers to appear in the idle loop as well as on the
+ CPU hotplug code paths. It can force IPIs on online CPUs,
+ including idle ones, so use with caution. Not for manual
+ selection in most cases.
+
+config TASKS_TRACE_RCU
+ bool
+ default n
+ select IRQ_WORK
config RCU_STALL_COMMON
def_bool TREE_RCU
@@ -111,7 +134,7 @@ config RCU_STALL_COMMON
making these warnings mandatory for the tree variants.
config RCU_NEED_SEGCBLIST
- def_bool ( TREE_RCU || TREE_SRCU )
+ def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC )
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
@@ -168,28 +191,10 @@ config RCU_FANOUT_LEAF
Take the default if unsure.
-config RCU_FAST_NO_HZ
- bool "Accelerate last non-dyntick-idle CPU's grace periods"
- depends on NO_HZ_COMMON && SMP && RCU_EXPERT
- default n
- help
- This option permits CPUs to enter dynticks-idle state even if
- they have RCU callbacks queued, and prevents RCU from waking
- these CPUs up more than roughly once every four jiffies (by
- default, you can adjust this using the rcutree.rcu_idle_gp_delay
- parameter), thus improving energy efficiency. On the other
- hand, this option increases the duration of RCU grace periods,
- for example, slowing down synchronize_rcu().
-
- Say Y if energy efficiency is critically important, and you
- don't care about increased grace-period durations.
-
- Say N if you are unsure.
-
config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
- default n
+ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT
+ default y if PREEMPT_RT
help
This option boosts the priority of preempted RCU readers that
block the current preemptible RCU grace period for too long.
@@ -212,6 +217,20 @@ config RCU_BOOST_DELAY
Accept the default if unsure.
+config RCU_EXP_KTHREAD
+ bool "Perform RCU expedited work in a real-time kthread"
+ depends on RCU_BOOST && RCU_EXPERT
+ default !PREEMPT_RT && NR_CPUS <= 32
+ help
+ Use this option to further reduce the latencies of expedited
+ grace periods at the expense of being more disruptive.
+
+ This option is disabled by default on PREEMPT_RT=y kernels which
+ disable expedited grace periods after boot by unconditionally
+ setting rcupdate.rcu_normal_after_boot=1.
+
+ Accept the default if unsure.
+
config RCU_NOCB_CPU
bool "Offload RCU callback processing from boot-selected CPUs"
depends on TREE_RCU
@@ -240,9 +259,38 @@ config RCU_NOCB_CPU
Say Y here if you need reduced OS jitter, despite added overhead.
Say N here if you are unsure.
+config RCU_NOCB_CPU_DEFAULT_ALL
+ bool "Offload RCU callback processing from all CPUs by default"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ Use this option to offload callback processing from all CPUs
+ by default, in the absence of the rcu_nocbs or nohz_full boot
+ parameter. This also avoids the need to use any boot parameters
+ to achieve the effect of offloading all CPUs on boot.
+
+ Say Y here if you want to offload all CPUs by default on boot.
+ Say N here if you are unsure.
+
+config RCU_NOCB_CPU_CB_BOOST
+ bool "Offload RCU callback from real-time kthread"
+ depends on RCU_NOCB_CPU && RCU_BOOST
+ default y if PREEMPT_RT
+ help
+ Use this option to invoke offloaded callbacks as SCHED_FIFO
+ to avoid starvation by heavy SCHED_OTHER background load.
+ Of course, running as SCHED_FIFO during callback floods will
+ cause the rcuo[ps] kthreads to monopolize the CPU for hundreds
+ of milliseconds or more. Therefore, when enabling this option,
+ it is your responsibility to ensure that latency-sensitive
+ tasks either run with higher priority or run on some other CPU.
+
+ Say Y here if you want to set RT priority for offloading kthreads.
+ Say N here if you are building a !PREEMPT_RT kernel and are unsure.
+
config TASKS_TRACE_RCU_READ_MB
bool "Tasks Trace RCU readers use memory barriers in user and idle"
- depends on RCU_EXPERT
+ depends on RCU_EXPERT && TASKS_TRACE_RCU
default PREEMPT_RT || NR_CPUS < 8
help
Use this option to further reduce the number of IPIs sent
@@ -258,4 +306,30 @@ config TASKS_TRACE_RCU_READ_MB
Say N here if you hate read-side memory barriers.
Take the default if you are unsure.
+config RCU_LAZY
+ bool "RCU callback lazy invocation functionality"
+ depends on RCU_NOCB_CPU
+ default n
+ help
+ To save power, batch RCU callbacks and flush after delay, memory
+ pressure, or callback list growing too big.
+
+config RCU_DOUBLE_CHECK_CB_TIME
+ bool "RCU callback-batch backup time check"
+ depends on RCU_EXPERT
+ default n
+ help
+ Use this option to provide more precise enforcement of the
+ rcutree.rcu_resched_ns module parameter in situations where
+ a single RCU callback might run for hundreds of microseconds,
+ thus defeating the 32-callback batching used to amortize the
+ cost of the fine-grained but expensive local_clock() function.
+
+ This option rounds rcutree.rcu_resched_ns up to the next
+ jiffy, and overrides the 32-callback batching if this limit
+ is exceeded.
+
+ Say Y here if you need tighter callback-limit enforcement.
+ Say N here if you are unsure.
+
endmenu # "RCU Subsystem"
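
As context for the new RCU_LAZY option above: with CONFIG_RCU_LAZY=y, callbacks posted via call_rcu() may be batched and flushed only after a delay, under memory pressure, or once the list grows large, whereas call_rcu_hurry() (which the rcuscale changes later in this series switch to) bypasses that batching. The following is a minimal illustrative sketch, not part of this patch, using hypothetical "foo" names:

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct foo {
                int data;
                struct rcu_head rh;
        };

        static void foo_reclaim(struct rcu_head *rh)
        {
                kfree(container_of(rh, struct foo, rh));
        }

        static void foo_release(struct foo *fp)
        {
                /* Laziness-tolerant: reclaim may be deferred for batching. */
                call_rcu(&fp->rh, foo_reclaim);
        }

        static void foo_release_now(struct foo *fp)
        {
                /* Bypass lazy batching when reclaim latency matters. */
                call_rcu_hurry(&fp->rh, foo_reclaim);
        }
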
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 1942c1f1bb65..9b0b52e1836f 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -27,10 +27,6 @@ config RCU_SCALE_TEST
tristate "performance tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs performance
@@ -46,10 +42,6 @@ config RCU_TORTURE_TEST
tristate "torture tests for RCU"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs torture tests
@@ -65,10 +57,6 @@ config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
select TORTURE_TEST
- select SRCU
- select TASKS_RCU
- select TASKS_RUDE_RCU
- select TASKS_TRACE_RCU
default n
help
This option provides a kernel module that runs performance tests
@@ -91,6 +79,57 @@ config RCU_CPU_STALL_TIMEOUT
RCU grace period persists, additional CPU stall warnings are
printed at more widely spaced intervals.
+config RCU_EXP_CPU_STALL_TIMEOUT
+ int "Expedited RCU CPU stall timeout in milliseconds"
+ depends on RCU_STALL_COMMON
+ range 0 300000
+ default 0
+ help
+ If a given expedited RCU grace period extends more than the
+ specified number of milliseconds, a CPU stall warning is printed.
+ If the RCU grace period persists, additional CPU stall warnings
+ are printed at more widely spaced intervals. A value of zero
+ says to use the RCU_CPU_STALL_TIMEOUT value converted from
+ seconds to milliseconds.
+
+config RCU_CPU_STALL_CPUTIME
+ bool "Provide additional RCU stall debug information"
+ depends on RCU_STALL_COMMON
+ default n
+ help
+ The statistics collected during the sampling period, such as the number of
+ (hard interrupts, soft interrupts, task switches) and the cputime of
+ (hard interrupts, soft interrupts, kernel tasks), are added to the
+ RCU stall report. For multiple continuous RCU stalls, all sampling
+ periods begin at half of the first RCU stall timeout.
+ The boot option rcupdate.rcu_cpu_stall_cputime has the same function
+ as this one, but will override this if it exists.
+
+config RCU_CPU_STALL_NOTIFIER
+ bool "Provide RCU CPU-stall notifiers"
+ depends on RCU_STALL_COMMON
+ depends on DEBUG_KERNEL
+ depends on RCU_EXPERT
+ default n
+ help
+ WARNING: You almost certainly do not want this!!!
+
+ Enable RCU CPU-stall notifiers, which are invoked just before
+ printing the RCU CPU stall warning. As such, bugs in notifier
+ callbacks can prevent stall warnings from being printed.
+ And the whole reason that a stall warning is being printed is
+ that something is hung up somewhere. Therefore, the notifier
+ callbacks must be written extremely carefully, preferably
+ containing only lockless code. After all, it is quite possible
+ that the whole reason that the RCU CPU stall is happening in
+ the first place is that someone forgot to release whatever lock
+ that you are thinking of acquiring. In which case, having your
+ notifier callback acquire that lock will hang, preventing the
+ RCU CPU stall warning from appearing.
+
+ Say Y here if you want RCU CPU stall notifiers (you don't want them)
+ Say N if you are unsure.
+
config RCU_TRACE
bool "Enable tracing for RCU"
depends on DEBUG_KERNEL
@@ -116,7 +155,7 @@ config RCU_EQS_DEBUG
config RCU_STRICT_GRACE_PERIOD
bool "Provide debug RCU implementation with short grace periods"
- depends on DEBUG_KERNEL && RCU_EXPERT
+ depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU
default n
select PREEMPT_COUNT if PREEMPT=n
help
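
To make the RCU_CPU_STALL_NOTIFIER warning above concrete: a stall notifier is an ordinary notifier_block whose callback is invoked just before the stall warning is printed, and it must stay lockless, since the stall may well be caused by the very lock a careless callback would try to acquire. A sketch of such a callback, not from this patch, and assuming the registration helper is named rcu_stall_chain_notifier_register():

        #include <linux/notifier.h>
        #include <linux/printk.h>

        /* Must be lockless: only report, never block or acquire locks. */
        static int my_rcu_stall_cb(struct notifier_block *nb,
                                   unsigned long val, void *v)
        {
                pr_info("rcu: stall notifier fired (val=%lu)\n", val);
                return NOTIFY_DONE;
        }

        static struct notifier_block my_rcu_stall_nb = {
                .notifier_call = my_rcu_stall_cb,
        };

        /* Assumed registration API (see kernel/rcu/update.c for the real one):
         *      rcu_stall_chain_notifier_register(&my_rcu_stall_nb);
         */
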
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 59ef1ae6dc37..f94f65877f2b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -10,19 +10,58 @@
#ifndef __LINUX_RCU_H
#define __LINUX_RCU_H
+#include <linux/slab.h>
#include <trace/events/rcu.h>
-/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
-#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
-
-
/*
* Grace-period counter management.
+ *
+ * The two least significant bits contain the control flags.
+ * The most significant bits contain the grace-period sequence counter.
+ *
+ * When both control flags are zero, no grace period is in progress.
+ * When either bit is non-zero, a grace period has started and is in
+ * progress. When the grace period completes, the control flags are reset
+ * to 0 and the grace-period sequence counter is incremented.
+ *
+ * However some specific RCU usages make use of custom values.
+ *
+ * SRCU special control values:
+ *
+ * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node
+ * is initialized.
+ *
+ * SRCU_STATE_IDLE : No SRCU gp is in progress
+ *
+ * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates
+ * we are scanning the readers on the slot
+ * defined as inactive (there might well
+ * be pending readers that will use that
+ * index, but their number is bounded).
+ *
+ * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state()
+ * Indicates we are flipping the readers
+ * index and then scanning the readers on the
+ * slot newly designated as inactive (again,
+ * the number of pending readers that will use
+ * this inactive index is bounded).
+ *
+ * RCU polled GP special control value:
+ *
+ * RCU_GET_STATE_COMPLETED : State value indicating an already-completed
+ * polled GP has completed. This value covers
+ * both the state and the counter of the
+ * grace-period sequence number.
*/
#define RCU_SEQ_CTR_SHIFT 2
#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1)
+/* Low-order bit definition for polled grace-period APIs. */
+#define RCU_GET_STATE_COMPLETED 0x1
+
+extern int sysctl_sched_rt_runtime;
+
/*
* Return the counter portion of a sequence number previously returned
* by rcu_seq_snap() or rcu_seq_current().
@@ -118,6 +157,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s)
}
/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred, but do not allow the
+ * (ULONG_MAX / 2) safety-factor/guard-band.
+ */
+static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
+{
+ unsigned long cur_s = READ_ONCE(*sp);
+
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+}
+
+/*
* Has a grace period completed since the time the old gp_seq was collected?
*/
static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new)
@@ -198,6 +249,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+static inline void debug_rcu_head_callback(struct rcu_head *rhp)
+{
+ if (unlikely(!rhp->func))
+ kmem_dump_obj(rhp);
+}
+
extern int rcu_cpu_stall_suppress_at_boot;
static inline bool rcu_stall_is_suppressed_at_boot(void)
@@ -205,12 +262,18 @@ static inline bool rcu_stall_is_suppressed_at_boot(void)
return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended();
}
+extern int rcu_cpu_stall_notifiers;
+
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
+extern int rcu_exp_cpu_stall_timeout;
+extern int rcu_cpu_stall_cputime;
+extern bool rcu_exp_stall_task_details __read_mostly;
int rcu_jiffies_till_stall_check(void);
+int rcu_exp_jiffies_till_stall_check(void);
static inline bool rcu_stall_is_suppressed(void)
{
@@ -271,7 +334,7 @@ void rcu_test_sync_prims(void);
*/
extern void resched_cpu(int cpu);
-#if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU)
+#if !defined(CONFIG_TINY_RCU)
#include <linux/rcu_node_tree.h>
@@ -308,6 +371,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
}
}
+extern void rcu_init_geometry(void);
+
/* Returns a pointer to the first leaf rcu_node structure. */
#define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1])
@@ -322,11 +387,13 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* specified state structure (for SRCU) or the only rcu_state structure
* (for RCU).
*/
-#define srcu_for_each_node_breadth_first(sp, rnp) \
+#define _rcu_for_each_node_breadth_first(sp, rnp) \
for ((rnp) = &(sp)->node[0]; \
(rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++)
#define rcu_for_each_node_breadth_first(rnp) \
- srcu_for_each_node_breadth_first(&rcu_state, rnp)
+ _rcu_for_each_node_breadth_first(&rcu_state, rnp)
+#define srcu_for_each_node_breadth_first(ssp, rnp) \
+ _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp)
/*
* Scan the leaves of the rcu_node hierarchy for the rcu_state structure.
@@ -358,6 +425,10 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
(cpu) <= rnp->grphi; \
(cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
+#endif /* !defined(CONFIG_TINY_RCU) */
+
+#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
+
/*
* Wrappers for the rcu_node::lock acquire and release.
*
@@ -378,7 +449,11 @@ do { \
smp_mb__after_unlock_lock(); \
} while (0)
-#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
+#define raw_spin_unlock_rcu_node(p) \
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
#define raw_spin_lock_irq_rcu_node(p) \
do { \
@@ -387,7 +462,10 @@ do { \
} while (0)
#define raw_spin_unlock_irq_rcu_node(p) \
- raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)); \
+} while (0)
#define raw_spin_lock_irqsave_rcu_node(p, flags) \
do { \
@@ -396,7 +474,10 @@ do { \
} while (0)
#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \
- raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
+do { \
+ lockdep_assert_irqs_disabled(); \
+ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags); \
+} while (0)
#define raw_spin_trylock_rcu_node(p) \
({ \
@@ -410,31 +491,43 @@ do { \
#define raw_lockdep_assert_held_rcu_node(p) \
lockdep_assert_held(&ACCESS_PRIVATE(p, lock))
-#endif /* #if defined(CONFIG_SRCU) || !defined(CONFIG_TINY_RCU) */
-
-#ifdef CONFIG_SRCU
-void srcu_init(void);
-#else /* #ifdef CONFIG_SRCU */
-static inline void srcu_init(void) { }
-#endif /* #else #ifdef CONFIG_SRCU */
+#endif // #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC)
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
static inline bool rcu_gp_is_normal(void) { return true; }
static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline bool rcu_async_should_hurry(void) { return false; }
static inline void rcu_expedite_gp(void) { }
static inline void rcu_unexpedite_gp(void) { }
-static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
+static inline void rcu_async_hurry(void) { }
+static inline void rcu_async_relax(void) { }
+static inline bool rcu_cpu_online(int cpu) { return true; }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
+bool rcu_async_should_hurry(void); /* Internal RCU use. */
void rcu_expedite_gp(void);
void rcu_unexpedite_gp(void);
+void rcu_async_hurry(void);
+void rcu_async_relax(void);
void rcupdate_announce_bootup_oddness(void);
+bool rcu_cpu_online(int cpu);
+#ifdef CONFIG_TASKS_RCU_GENERIC
void show_rcu_tasks_gp_kthreads(void);
-void rcu_request_urgent_qs_task(struct task_struct *t);
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void show_rcu_tasks_gp_kthreads(void) {}
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
#endif /* #else #ifdef CONFIG_TINY_RCU */
+#ifdef CONFIG_TASKS_RCU
+struct task_struct *get_rcu_tasks_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void);
+#endif // # ifdef CONFIG_TASKS_RUDE_RCU
+
#define RCU_SCHEDULER_INACTIVE 0
#define RCU_SCHEDULER_INIT 1
#define RCU_SCHEDULER_RUNNING 2
@@ -449,6 +542,14 @@ enum rcutorture_type {
INVALID_RCU_FLAVOR
};
+#if defined(CONFIG_RCU_LAZY)
+unsigned long rcu_lazy_get_jiffies_till_flush(void);
+void rcu_lazy_set_jiffies_till_flush(unsigned long j);
+#else
+static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; }
+static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { }
+#endif
+
#if defined(CONFIG_TREE_RCU)
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
unsigned long *gp_seq);
@@ -478,10 +579,6 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
static inline void rcu_gp_set_torture_wait(int duration) { }
#endif
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
-#endif
-
#ifdef CONFIG_TINY_SRCU
static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
@@ -509,27 +606,36 @@ static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
static inline unsigned long
srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
+static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
static inline void show_rcu_gp_kthreads(void) { }
static inline int rcu_get_gp_kthreads_prio(void) { return 0; }
static inline void rcu_fwd_progress_check(unsigned long j) { }
+static inline void rcu_gp_slow_register(atomic_t *rgssp) { }
+static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_dynticks_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
unsigned long srcu_batches_completed(struct srcu_struct *sp);
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
void rcu_fwd_progress_check(unsigned long j);
void rcu_force_quiescent_state(void);
extern struct workqueue_struct *rcu_gp_wq;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+extern struct kthread_worker *rcu_exp_gp_kworker;
+extern struct kthread_worker *rcu_exp_par_gp_kworker;
+#else /* !CONFIG_RCU_EXP_KTHREAD */
extern struct workqueue_struct *rcu_par_gp_wq;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+void rcu_gp_slow_register(atomic_t *rgssp);
+void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
#ifdef CONFIG_RCU_NOCB_CPU
-bool rcu_is_nocb_cpu(int cpu);
void rcu_bind_current_to_nocb(void);
#else
-static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
static inline void rcu_bind_current_to_nocb(void) { }
#endif
@@ -549,4 +655,16 @@ void show_rcu_tasks_trace_gp_kthread(void);
static inline void show_rcu_tasks_trace_gp_kthread(void) {}
#endif
+#ifdef CONFIG_TINY_RCU
+static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; }
+#else
+bool rcu_cpu_beenfullyonline(int cpu);
+#endif
+
+#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+int rcu_stall_notifier_call_chain(unsigned long val, void *v);
+#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; }
+#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER)
+
#endif /* __LINUX_RCU_H */
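
The grace-period sequence comment added to rcu.h above can be read alongside the helpers that already live in this header: the low RCU_SEQ_CTR_SHIFT (two) bits hold the state and the remaining bits count completed grace periods. A standalone sketch of the encoding, mirroring rcu_seq_ctr()/rcu_seq_state() from rcu.h and shown only for illustration:

        #define RCU_SEQ_CTR_SHIFT       2
        #define RCU_SEQ_STATE_MASK      ((1 << RCU_SEQ_CTR_SHIFT) - 1)

        /* Number of completed grace periods encoded in s. */
        static unsigned long rcu_seq_ctr(unsigned long s)
        {
                return s >> RCU_SEQ_CTR_SHIFT;
        }

        /* Low-order state bits: zero means idle, non-zero means a GP is in flight. */
        static int rcu_seq_state(unsigned long s)
        {
                return s & RCU_SEQ_STATE_MASK;
        }

        /*
         * Starting a grace period increments the state to 1; ending it clears
         * the state bits and advances the counter, that is,
         * (s & ~RCU_SEQ_STATE_MASK) + (1 << RCU_SEQ_CTR_SHIFT).
         */
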
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2d2a6b6b9dfb..1693ea22ef1b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -7,10 +7,10 @@
* Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
-#include <linux/types.h>
-#include <linux/kernel.h>
+#include <linux/cpu.h>
#include <linux/interrupt.h>
-#include <linux/rcupdate.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
#include "rcu_segcblist.h"
@@ -88,23 +88,135 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
#endif
}
+/* Get the length of a segment of the rcu_segcblist structure. */
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ return READ_ONCE(rsclp->seglen[seg]);
+}
+
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp)
+{
+ long len = 0;
+ int i;
+
+ for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
+ len += rcu_segcblist_get_seglen(rsclp, i);
+
+ return len;
+}
+
+/* Set the length of a segment of the rcu_segcblist structure. */
+static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], v);
+}
+
+/* Increase the numeric length of a segment by a specified amount. */
+static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v)
+{
+ WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v);
+}
+
+/* Move from's segment length to to's segment. */
+static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to)
+{
+ long len;
+
+ if (from == to)
+ return;
+
+ len = rcu_segcblist_get_seglen(rsclp, from);
+ if (!len)
+ return;
+
+ rcu_segcblist_add_seglen(rsclp, to, len);
+ rcu_segcblist_set_seglen(rsclp, from, 0);
+}
+
+/* Increment segment's length. */
+static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg)
+{
+ rcu_segcblist_add_seglen(rsclp, seg, 1);
+}
+
/*
* Increase the numeric length of an rcu_segcblist structure by the
* specified amount, which can be negative. This can cause the ->len
* field to disagree with the actual number of callbacks on the structure.
* This increase is fully ordered with respect to the callers accesses
* both before and after.
+ *
+ * So why on earth is a memory barrier required both before and after
+ * the update to the ->len field???
+ *
+ * The reason is that rcu_barrier() locklessly samples each CPU's ->len
+ * field, and if a given CPU's field is zero, avoids IPIing that CPU.
+ * This can of course race with both queuing and invoking of callbacks.
+ * Failing to correctly handle either of these races could result in
+ * rcu_barrier() failing to IPI a CPU that actually had callbacks queued
+ * which rcu_barrier() was obligated to wait on. And if rcu_barrier()
+ * failed to wait on such a callback, unloading certain kernel modules
+ * would result in calls to functions whose code was no longer present in
+ * the kernel, for but one example.
+ *
+ * Therefore, ->len transitions from 1->0 and 0->1 have to be carefully
+ * ordered with respect to both list modifications and the rcu_barrier().
+ *
+ * The queuing case is CASE 1 and the invoking case is CASE 2.
+ *
+ * CASE 1: Suppose that CPU 0 has no callbacks queued, but invokes
+ * call_rcu() just as CPU 1 invokes rcu_barrier(). CPU 0's ->len field
+ * will transition from 0->1, which is one of the transitions that must
+ * be handled carefully. Without the full memory barriers after the ->len
+ * update and at the beginning of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * call_rcu().
+ * rcu_barrier() sees ->len as 0.
+ * set ->len = 1.
+ * rcu_barrier() does nothing.
+ * module is unloaded.
+ * callback invokes unloaded function!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0 will
+ * have unambiguously preceded the return from the racing call_rcu(), which
+ * means that this call_rcu() invocation is OK to not wait on. After all,
+ * you are supposed to make sure that any problematic call_rcu() invocations
+ * happen before the rcu_barrier().
+ *
+ *
+ * CASE 2: Suppose that CPU 0 is invoking its last callback just as
+ * CPU 1 invokes rcu_barrier(). CPU 0's ->len field will transition from
+ * 1->0, which is one of the transitions that must be handled carefully.
+ * Without the full memory barriers before the ->len update and at the
+ * end of rcu_barrier(), the following could happen:
+ *
+ * CPU 0 CPU 1
+ *
+ * start invoking last callback
+ * set ->len = 0 (reordered)
+ * rcu_barrier() sees ->len as 0
+ * rcu_barrier() does nothing.
+ * module is unloaded
+ * callback executing after unloaded!
+ *
+ * With the full barriers, any case where rcu_barrier() sees ->len as 0
+ * will be fully ordered after the completion of the callback function,
+ * so that the module unloading operation is completely safe.
+ *
*/
-static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
{
#ifdef CONFIG_RCU_NOCB_CPU
- smp_mb__before_atomic(); /* Up to the caller! */
+ smp_mb__before_atomic(); // Read header comment above.
atomic_long_add(v, &rsclp->len);
- smp_mb__after_atomic(); /* Up to the caller! */
+ smp_mb__after_atomic(); // Read header comment above.
#else
- smp_mb(); /* Up to the caller! */
+ smp_mb(); // Read header comment above.
WRITE_ONCE(rsclp->len, rsclp->len + v);
- smp_mb(); /* Up to the caller! */
+ smp_mb(); // Read header comment above.
#endif
}
@@ -120,26 +232,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
}
/*
- * Exchange the numeric length of the specified rcu_segcblist structure
- * with the specified value. This can cause the ->len field to disagree
- * with the actual number of callbacks on the structure. This exchange is
- * fully ordered with respect to the callers accesses both before and after.
- */
-static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
- return atomic_long_xchg(&rsclp->len, v);
-#else
- long ret = rsclp->len;
-
- smp_mb(); /* Up to the caller! */
- WRITE_ONCE(rsclp->len, v);
- smp_mb(); /* Up to the caller! */
- return ret;
-#endif
-}
-
-/*
* Initialize an rcu_segcblist structure.
*/
void rcu_segcblist_init(struct rcu_segcblist *rsclp)
@@ -149,10 +241,12 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
rsclp->head = NULL;
- for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+ for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
rsclp->tails[i] = &rsclp->head;
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
rcu_segcblist_set_len(rsclp, 0);
- rsclp->enabled = 1;
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED);
}
/*
@@ -163,16 +257,18 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
{
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
- rsclp->enabled = 0;
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED);
}
/*
- * Mark the specified rcu_segcblist structure as offloaded. This
- * structure must be empty.
+ * Mark the specified rcu_segcblist structure as offloaded (or not)
*/
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload)
{
- rsclp->offloaded = 1;
+ if (offload)
+ rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED);
+ else
+ rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED);
}
/*
@@ -245,7 +341,7 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp)
{
rcu_segcblist_inc_len(rsclp);
- smp_mb(); /* Ensure counts are updated before callback is enqueued. */
+ rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL);
rhp->next = NULL;
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
@@ -272,8 +368,9 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
smp_mb(); /* Ensure counts are updated before callback is entrained. */
rhp->next = NULL;
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
- if (rsclp->tails[i] != rsclp->tails[i - 1])
+ if (!rcu_segcblist_segempty(rsclp, i))
break;
+ rcu_segcblist_inc_seglen(rsclp, i);
WRITE_ONCE(*rsclp->tails[i], rhp);
for (; i <= RCU_NEXT_TAIL; i++)
WRITE_ONCE(rsclp->tails[i], &rhp->next);
@@ -281,21 +378,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
}
/*
- * Extract only the counts from the specified rcu_segcblist structure,
- * and place them in the specified rcu_cblist structure. This function
- * supports both callback orphaning and invocation, hence the separation
- * of counts and callbacks. (Callbacks ready for invocation must be
- * orphaned and adopted separately from pending callbacks, but counts
- * apply to all callbacks. Locking must be used to make sure that
- * both orphaned-callbacks lists are consistent.)
- */
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
- struct rcu_cblist *rclp)
-{
- rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
-}
-
-/*
* Extract only those callbacks ready to be invoked from the specified
* rcu_segcblist structure and place them in the specified rcu_cblist
* structure.
@@ -307,6 +389,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_ready_cbs(rsclp))
return; /* Nothing to do. */
+ rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL);
*rclp->tail = rsclp->head;
WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
@@ -314,6 +397,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
WRITE_ONCE(rsclp->tails[i], &rsclp->head);
+ rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
}
/*
@@ -330,11 +414,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_pend_cbs(rsclp))
return; /* Nothing to do. */
+ rclp->len = 0;
*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
- for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
+ for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) {
+ rclp->len += rcu_segcblist_get_seglen(rsclp, i);
WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
+ rcu_segcblist_set_seglen(rsclp, i, 0);
+ }
}
/*
@@ -345,7 +433,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rcu_segcblist_add_len(rsclp, rclp->len);
- rclp->len = 0;
}
/*
@@ -359,6 +446,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
if (!rclp->head)
return; /* No callbacks to move. */
+ rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len);
*rclp->tail = rsclp->head;
WRITE_ONCE(rsclp->head, rclp->head);
for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
@@ -379,6 +467,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
{
if (!rclp->head)
return; /* Nothing to do. */
+
+ rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len);
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
}
@@ -403,6 +493,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
break;
WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
}
/* If no callbacks moved, nothing more need be done. */
@@ -414,15 +505,16 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
/*
- * Callbacks moved, so clean up the misordered ->tails[] pointers
- * that now point into the middle of the list of ready-to-invoke
- * callbacks. The overall effect is to copy down the later pointers
- * into the gap that was created by the now-ready segments.
+ * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
+ * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the
+ * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
+ * created by the now-ready-to-invoke segments.
*/
for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
break; /* No more callbacks. */
WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
+ rcu_segcblist_move_seglen(rsclp, i, j);
rsclp->gp_seq[j] = rsclp->gp_seq[i];
}
}
@@ -444,7 +536,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
*/
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
{
- int i;
+ int i, j;
WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
@@ -459,7 +551,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
* as their ->gp_seq[] grace-period completion sequence number.
*/
for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
- if (rsclp->tails[i] != rsclp->tails[i - 1] &&
+ if (!rcu_segcblist_segempty(rsclp, i) &&
ULONG_CMP_LT(rsclp->gp_seq[i], seq))
break;
@@ -487,6 +579,10 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL)
return false;
+ /* Accounting: everything below i is about to get merged into i. */
+ for (j = i + 1; j <= RCU_NEXT_TAIL; j++)
+ rcu_segcblist_move_seglen(rsclp, j, i);
+
/*
* Merge all later callbacks, including newly arrived callbacks,
* into the segment located by the for-loop above. Assign "seq"
@@ -514,13 +610,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
struct rcu_cblist donecbs;
struct rcu_cblist pendcbs;
+ lockdep_assert_cpus_held();
+
rcu_cblist_init(&donecbs);
rcu_cblist_init(&pendcbs);
- rcu_segcblist_extract_count(src_rsclp, &donecbs);
+
rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+
+ /*
+ * No need for smp_mb() before setting length to 0, because the CPU
+ * hotplug lock excludes rcu_barrier.
+ */
+ rcu_segcblist_set_len(src_rsclp, 0);
+
rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_count(dst_rsclp, &pendcbs);
rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+
rcu_segcblist_init(src_rsclp);
}
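
The per-segment length counters added above carry an invariant worth spelling out: at points where the list is stable (for example, with the relevant lock held and no enqueue in flight), the sum of the ->seglen[] entries must equal the overall ->len. A debug-style check along these lines, shown only as an illustrative sketch built from the helpers this patch introduces:

        /* Assumes the declarations from kernel/rcu/rcu_segcblist.h. */
        #include <linux/bug.h>

        static void check_segcblist_counts(struct rcu_segcblist *rsclp)
        {
                /* Sum of the per-segment ->seglen[] counters. */
                long segsum = rcu_segcblist_n_segment_cbs(rsclp);
                /* The overall ->len, maintained with full ordering. */
                long total = rcu_segcblist_n_cbs(rsclp);

                /* Caller must hold whatever lock protects this list. */
                WARN_ON_ONCE(segsum != total);
        }
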
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 492262bcb591..4fe877f5f654 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -15,6 +15,11 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
return READ_ONCE(rclp->len);
}
+long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg);
+
+/* Return number of callbacks in segmented callback list by summing seglen. */
+long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp);
+
void rcu_cblist_init(struct rcu_cblist *rclp);
void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
@@ -50,19 +55,53 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
#endif
}
+static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ WRITE_ONCE(rsclp->flags, rsclp->flags | flags);
+}
+
+static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags);
+}
+
+static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
+ int flags)
+{
+ return READ_ONCE(rsclp->flags) & flags;
+}
+
/*
* Is the specified rcu_segcblist enabled, for example, not corresponding
* to an offline CPU?
*/
static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
{
- return rsclp->enabled;
+ return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED);
}
-/* Is the specified rcu_segcblist offloaded? */
+/*
+ * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the
+ * [de]offloading process)?
+ */
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
- return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded;
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING))
+ return true;
+
+ return false;
+}
+
+static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp)
+{
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE))
+ return true;
+
+ return false;
}
/*
@@ -75,10 +114,22 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
}
+/*
+ * Is the specified segment of the specified rcu_segcblist structure
+ * empty of callbacks?
+ */
+static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
+{
+ if (seg == RCU_DONE_TAIL)
+ return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
+ return rsclp->tails[seg - 1] == rsclp->tails[seg];
+}
+
void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
@@ -88,8 +139,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
struct rcu_head *rhp);
-void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
- struct rcu_cblist *rclp);
void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp);
void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 06491d5530db..ffdb30495e3c 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -50,8 +50,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s)
#define VERBOSE_SCALEOUT_STRING(s) \
do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0)
-#define VERBOSE_SCALEOUT_ERRSTRING(s) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0)
+#define SCALEOUT_ERRSTRING(s) \
+ pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s)
/*
* The intended use cases for the nreaders and nwriters module parameters
@@ -84,17 +84,20 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
#endif
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
-torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
+torture_param(int, minruntime, 0, "Minimum run time (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
torture_param(bool, shutdown, RCUSCALE_SHUTDOWN,
"Shutdown at end of scalability tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
+torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
+torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
static char *scale_type = "rcu";
module_param(scale_type, charp, 0444);
@@ -138,6 +141,7 @@ struct rcu_scale_ops {
void (*gp_barrier)(void);
void (*sync)(void);
void (*exp_sync)(void);
+ struct task_struct *(*rso_gp_kthread)(void);
const char *name;
};
@@ -175,7 +179,7 @@ static struct rcu_scale_ops rcu_ops = {
.get_gp_seq = rcu_get_gp_seq,
.gp_diff = rcu_seq_diff,
.exp_completed = rcu_exp_batches_completed,
- .async = call_rcu,
+ .async = call_rcu_hurry,
.gp_barrier = rcu_barrier,
.sync = synchronize_rcu,
.exp_sync = synchronize_rcu_expedited,
@@ -268,6 +272,8 @@ static struct rcu_scale_ops srcud_ops = {
.name = "srcud"
};
+#ifdef CONFIG_TASKS_RCU
+
/*
* Definitions for RCU-tasks scalability testing.
*/
@@ -292,9 +298,58 @@ static struct rcu_scale_ops tasks_ops = {
.gp_barrier = rcu_barrier_tasks,
.sync = synchronize_rcu_tasks,
.exp_sync = synchronize_rcu_tasks,
+ .rso_gp_kthread = get_rcu_tasks_gp_kthread,
.name = "tasks"
};
+#define TASKS_OPS &tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+
+/*
+ * Definitions for RCU-tasks-rude scalability testing.
+ */
+
+static int tasks_rude_scale_read_lock(void)
+{
+ return 0;
+}
+
+static void tasks_rude_scale_read_unlock(int idx)
+{
+}
+
+static struct rcu_scale_ops tasks_rude_ops = {
+ .ptype = RCU_TASKS_RUDE_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_rude_scale_read_lock,
+ .readunlock = tasks_rude_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks_rude,
+ .gp_barrier = rcu_barrier_tasks_rude,
+ .sync = synchronize_rcu_tasks_rude,
+ .exp_sync = synchronize_rcu_tasks_rude,
+ .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread,
+ .name = "tasks-rude"
+};
+
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
/*
* Definitions for RCU-tasks-trace scalability testing.
*/
@@ -321,9 +376,18 @@ static struct rcu_scale_ops tasks_tracing_ops = {
.gp_barrier = rcu_barrier_tasks_trace,
.sync = synchronize_rcu_tasks_trace,
.exp_sync = synchronize_rcu_tasks_trace,
+ .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread,
.name = "tasks-tracing"
};
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -389,20 +453,23 @@ rcu_scale_writer(void *arg)
{
int i = 0;
int i_max;
+ unsigned long jdone;
long me = (long)arg;
struct rcu_head *rhp = NULL;
bool started = false, done = false, alldone = false;
u64 t;
+ DEFINE_TORTURE_RANDOM(tr);
u64 *wdp;
u64 *wdpp = writer_durations[me];
VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ current->flags |= PF_NO_SETAFFINITY;
sched_set_fifo_low(current);
if (holdoff)
- schedule_timeout_uninterruptible(holdoff * HZ);
+ schedule_timeout_idle(holdoff * HZ);
/*
* Wait until rcu_end_inkernel_boot() is called for normal GP tests
@@ -423,9 +490,12 @@ rcu_scale_writer(void *arg)
}
}
+ jdone = jiffies + minruntime * HZ;
do {
if (writer_holdoff)
udelay(writer_holdoff);
+ if (writer_holdoff_jiffies)
+ schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
wdp = &wdpp[i];
*wdp = ktime_get_mono_fast_ns();
if (gp_async) {
@@ -453,7 +523,7 @@ retry:
if (!started &&
atomic_read(&n_rcu_scale_writer_started) >= nrealwriters)
started = true;
- if (!done && i >= MIN_MEAS) {
+ if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
done = true;
sched_set_normal(current, 0);
pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
@@ -487,7 +557,7 @@ retry:
if (gp_async) {
cur_ops->gp_barrier();
}
- writer_n_durations[me] = i_max;
+ writer_n_durations[me] = i_max + 1;
torture_kthread_stopping("rcu_scale_writer");
return 0;
}
@@ -496,91 +566,8 @@ static void
rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n",
- scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
-}
-
-static void
-rcu_scale_cleanup(void)
-{
- int i;
- int j;
- int ngps = 0;
- u64 *wdp;
- u64 *wdpp;
-
- /*
- * Would like warning at start, but everything is expedited
- * during the mid-boot phase, so have to wait till the end.
- */
- if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
- VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
- if (rcu_gp_is_normal() && gp_exp)
- VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
- if (gp_exp && gp_async)
- VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
-
- if (torture_cleanup_begin())
- return;
- if (!cur_ops) {
- torture_cleanup_end();
- return;
- }
-
- if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++)
- torture_stop_kthread(rcu_scale_reader,
- reader_tasks[i]);
- kfree(reader_tasks);
- }
-
- if (writer_tasks) {
- for (i = 0; i < nrealwriters; i++) {
- torture_stop_kthread(rcu_scale_writer,
- writer_tasks[i]);
- if (!writer_n_durations)
- continue;
- j = writer_n_durations[i];
- pr_alert("%s%s writer %d gps: %d\n",
- scale_type, SCALE_FLAG, i, j);
- ngps += j;
- }
- pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
- scale_type, SCALE_FLAG,
- t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
- t_rcu_scale_writer_finished -
- t_rcu_scale_writer_started,
- ngps,
- rcuscale_seq_diff(b_rcu_gp_test_finished,
- b_rcu_gp_test_started));
- for (i = 0; i < nrealwriters; i++) {
- if (!writer_durations)
- break;
- if (!writer_n_durations)
- continue;
- wdpp = writer_durations[i];
- if (!wdpp)
- continue;
- for (j = 0; j <= writer_n_durations[i]; j++) {
- wdp = &wdpp[j];
- pr_alert("%s%s %4d writer-duration: %5d %llu\n",
- scale_type, SCALE_FLAG,
- i, j, *wdp);
- if (j % 100 == 0)
- schedule_timeout_uninterruptible(1);
- }
- kfree(writer_durations[i]);
- }
- kfree(writer_tasks);
- kfree(writer_durations);
- kfree(writer_n_durations);
- }
-
- /* Do torture-type-specific cleanup operations. */
- if (cur_ops->cleanup != NULL)
- cur_ops->cleanup();
-
- torture_cleanup_end();
+ "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n",
+ scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown);
}
/*
@@ -603,21 +590,6 @@ static int compute_real(int n)
}
/*
- * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
- * down system.
- */
-static int
-rcu_scale_shutdown(void *arg)
-{
- wait_event(shutdown_wq,
- atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
- smp_mb(); /* Wake before output. */
- rcu_scale_cleanup();
- kernel_power_off();
- return -EINVAL;
-}
-
-/*
* kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number
* of iterations and measure total time and number of GP for all iterations to complete.
*/
@@ -625,17 +597,29 @@ rcu_scale_shutdown(void *arg)
torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu().");
torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration.");
torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees.");
+torture_param(bool, kfree_rcu_test_double, false, "Do we run a kfree_rcu() double-argument scale test?");
+torture_param(bool, kfree_rcu_test_single, false, "Do we run a kfree_rcu() single-argument scale test?");
static struct task_struct **kfree_reader_tasks;
static int kfree_nrealthreads;
static atomic_t n_kfree_scale_thread_started;
static atomic_t n_kfree_scale_thread_ended;
+static struct task_struct *kthread_tp;
+static u64 kthread_stime;
struct kfree_obj {
char kfree_obj[8];
struct rcu_head rh;
};
+/* Used if doing RCU-kfree'ing via call_rcu(). */
+static void kfree_call_rcu(struct rcu_head *rh)
+{
+ struct kfree_obj *obj = container_of(rh, struct kfree_obj, rh);
+
+ kfree(obj);
+}
+
static int
kfree_scale_thread(void *arg)
{
@@ -644,10 +628,13 @@ kfree_scale_thread(void *arg)
struct kfree_obj *alloc_ptr;
u64 start_time, end_time;
long long mem_begin, mem_during = 0;
+ bool kfree_rcu_test_both;
+ DEFINE_TORTURE_RANDOM(tr);
VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started");
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
set_user_nice(current, MAX_NICE);
+ kfree_rcu_test_both = (kfree_rcu_test_single == kfree_rcu_test_double);
start_time = ktime_get_mono_fast_ns();
@@ -670,7 +657,20 @@ kfree_scale_thread(void *arg)
if (!alloc_ptr)
return -ENOMEM;
- kfree_rcu(alloc_ptr, rh);
+ if (kfree_by_call_rcu) {
+ call_rcu(&(alloc_ptr->rh), kfree_call_rcu);
+ continue;
+ }
+
+ // By default kfree_rcu_test_single and kfree_rcu_test_double are
+ // initialized to false. If both have the same value (false or true)
+ // both are randomly tested, otherwise only the one with value true
+ // is tested.
+ if ((kfree_rcu_test_single && !kfree_rcu_test_double) ||
+ (kfree_rcu_test_both && torture_random(&tr) & 0x800))
+ kfree_rcu_mightsleep(alloc_ptr);
+ else
+ kfree_rcu(alloc_ptr, rh);
}
cond_resched();
@@ -723,8 +723,8 @@ kfree_scale_cleanup(void)
static int
kfree_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq,
- atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
+ wait_event_idle(shutdown_wq,
+ atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
smp_mb(); /* Wake before output. */
@@ -733,11 +733,62 @@ kfree_scale_shutdown(void *arg)
return -EINVAL;
}
+// Used if doing RCU-kfree'ing via call_rcu().
+static unsigned long jiffies_at_lazy_cb;
+static struct rcu_head lazy_test1_rh;
+static int rcu_lazy_test1_cb_called;
+static void call_rcu_lazy_test1(struct rcu_head *rh)
+{
+ jiffies_at_lazy_cb = jiffies;
+ WRITE_ONCE(rcu_lazy_test1_cb_called, 1);
+}
+
static int __init
kfree_scale_init(void)
{
- long i;
int firsterr = 0;
+ long i;
+ unsigned long jif_start;
+ unsigned long orig_jif;
+
+ pr_alert("%s" SCALE_FLAG
+ "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n",
+ scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single);
+
+ // Also, do a quick self-test to ensure laziness is as much as
+ // expected.
+ if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) {
+ pr_alert("CONFIG_RCU_LAZY is disabled, falling back to kfree_rcu() for delayed RCU kfree'ing\n");
+ kfree_by_call_rcu = 0;
+ }
+
+ if (kfree_by_call_rcu) {
+ /* do a test to check the timeout. */
+ orig_jif = rcu_lazy_get_jiffies_till_flush();
+
+ rcu_lazy_set_jiffies_till_flush(2 * HZ);
+ rcu_barrier();
+
+ jif_start = jiffies;
+ jiffies_at_lazy_cb = 0;
+ call_rcu(&lazy_test1_rh, call_rcu_lazy_test1);
+
+ smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1);
+
+ rcu_lazy_set_jiffies_till_flush(orig_jif);
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n");
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) {
+ pr_alert("ERROR: call_rcu() CBs are being too lazy!\n");
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+ }
kfree_nrealthreads = compute_real(kfree_nthreads);
/* Start up the kthreads. */
@@ -745,12 +796,14 @@ kfree_scale_init(void)
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(kfree_scale_shutdown, NULL,
shutdown_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
schedule_timeout_uninterruptible(1);
}
- pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj));
+ pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n",
+ kfree_mult * sizeof(struct kfree_obj),
+ kfree_by_call_rcu);
kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
GFP_KERNEL);
@@ -762,7 +815,7 @@ kfree_scale_init(void)
for (i = 0; i < kfree_nrealthreads; i++) {
firsterr = torture_create_kthread(kfree_scale_thread, (void *)i,
kfree_reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
@@ -778,13 +831,127 @@ unwind:
return firsterr;
}
+static void
+rcu_scale_cleanup(void)
+{
+ int i;
+ int j;
+ int ngps = 0;
+ u64 *wdp;
+ u64 *wdpp;
+
+ /*
+ * Would like warning at start, but everything is expedited
+ * during the mid-boot phase, so have to wait till the end.
+ */
+ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+ if (rcu_gp_is_normal() && gp_exp)
+ SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+ if (gp_exp && gp_async)
+ SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
+
+ // If built-in, just report all of the GP kthread's CPU time.
+ if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread)
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp) {
+ u32 ns;
+ u64 us;
+
+ kthread_stime = kthread_tp->stime - kthread_stime;
+ us = div_u64_rem(kthread_stime, 1000, &ns);
+ pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns);
+ show_rcu_gp_kthreads();
+ }
+ if (kfree_rcu_test) {
+ kfree_scale_cleanup();
+ return;
+ }
+
+ if (torture_cleanup_begin())
+ return;
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
+
+ if (reader_tasks) {
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_scale_reader,
+ reader_tasks[i]);
+ kfree(reader_tasks);
+ }
+
+ if (writer_tasks) {
+ for (i = 0; i < nrealwriters; i++) {
+ torture_stop_kthread(rcu_scale_writer,
+ writer_tasks[i]);
+ if (!writer_n_durations)
+ continue;
+ j = writer_n_durations[i];
+ pr_alert("%s%s writer %d gps: %d\n",
+ scale_type, SCALE_FLAG, i, j);
+ ngps += j;
+ }
+ pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+ scale_type, SCALE_FLAG,
+ t_rcu_scale_writer_started, t_rcu_scale_writer_finished,
+ t_rcu_scale_writer_finished -
+ t_rcu_scale_writer_started,
+ ngps,
+ rcuscale_seq_diff(b_rcu_gp_test_finished,
+ b_rcu_gp_test_started));
+ for (i = 0; i < nrealwriters; i++) {
+ if (!writer_durations)
+ break;
+ if (!writer_n_durations)
+ continue;
+ wdpp = writer_durations[i];
+ if (!wdpp)
+ continue;
+ for (j = 0; j < writer_n_durations[i]; j++) {
+ wdp = &wdpp[j];
+ pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+ scale_type, SCALE_FLAG,
+ i, j, *wdp);
+ if (j % 100 == 0)
+ schedule_timeout_uninterruptible(1);
+ }
+ kfree(writer_durations[i]);
+ }
+ kfree(writer_tasks);
+ kfree(writer_durations);
+ kfree(writer_n_durations);
+ }
+
+ /* Do torture-type-specific cleanup operations. */
+ if (cur_ops->cleanup != NULL)
+ cur_ops->cleanup();
+
+ torture_cleanup_end();
+}
+
+/*
+ * RCU scalability shutdown kthread. Just waits to be awakened, then shuts
+ * down the system.
+ */
+static int
+rcu_scale_shutdown(void *arg)
+{
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ smp_mb(); /* Wake before output. */
+ rcu_scale_cleanup();
+ kernel_power_off();
+ return -EINVAL;
+}
+
static int __init
rcu_scale_init(void)
{
long i;
int firsterr = 0;
static struct rcu_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops
+ &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
};
if (!torture_init_begin(scale_type, verbose))
@@ -809,6 +976,11 @@ rcu_scale_init(void)
if (cur_ops->init)
cur_ops->init();
+ if (cur_ops->rso_gp_kthread) {
+ kthread_tp = cur_ops->rso_gp_kthread();
+ if (kthread_tp)
+ kthread_stime = kthread_tp->stime;
+ }
if (kfree_rcu_test)
return kfree_scale_init();
@@ -825,21 +997,21 @@ rcu_scale_init(void)
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(rcu_scale_shutdown, NULL,
shutdown_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
schedule_timeout_uninterruptible(1);
}
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
firsterr = torture_create_kthread(rcu_scale_reader, (void *)i,
reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
@@ -852,7 +1024,7 @@ rcu_scale_init(void)
kcalloc(nrealwriters, sizeof(*writer_n_durations),
GFP_KERNEL);
if (!writer_tasks || !writer_durations || !writer_n_durations) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -866,7 +1038,7 @@ rcu_scale_init(void)
}
firsterr = torture_create_kthread(rcu_scale_writer, (void *)i,
writer_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();
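
For reference, a sketch (not part of the patch) of how the TASKS_OPS-style macros used in the scale_ops[] initializer above compose the array. The expansion shown assumes only CONFIG_TASKS_RCU=y, with the other tasks flavors disabled:

	#ifdef CONFIG_TASKS_RCU
	#define TASKS_OPS &tasks_ops,
	#else
	#define TASKS_OPS
	#endif

	static struct rcu_scale_ops *scale_ops[] = {
		&rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
	};
	/* ...which the preprocessor reduces to:
	 *	{ &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, };
	 * so the array never references ops structures that were not built. */
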
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 528ed10b78fd..7567ca8e743c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -21,6 +21,7 @@
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate_wait.h>
+#include <linux/rcu_notifier.h>
#include <linux/interrupt.h>
#include <linux/sched/signal.h>
#include <uapi/linux/sched/types.h>
@@ -46,6 +47,7 @@
#include <linux/oom.h>
#include <linux/tick.h>
#include <linux/rcupdate_trace.h>
+#include <linux/nmi.h>
#include "rcu.h"
@@ -53,15 +55,18 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
/* Bits for ->extendables field, extendables param, and related definitions. */
-#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1)
+#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */
+#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2)
#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
-#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */
+#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */
+#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */
#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
@@ -71,82 +76,92 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@
torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
"Extend readers by disabling bh (1), irqs (2), or preempt (4)");
-torture_param(int, fqs_duration, 0,
- "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable");
torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(bool, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)");
torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
-torture_param(int, fwd_progress_holdoff, 60,
- "Time between forward-progress tests (s)");
-torture_param(bool, fwd_progress_need_resched, 1,
- "Hide cond_resched() behind need_resched()");
+torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)");
+torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
+torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
+torture_param(bool, gp_cond_exp_full, false,
+ "Use conditional/async full-stateexpedited GP wait primitives");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
-torture_param(bool, gp_normal, false,
- "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
+torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
+torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
+torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
-torture_param(int, n_barrier_cbs, 0,
- "# of callbacks/kthreads for barrier testing");
+torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
-torture_param(int, object_debug, 0,
- "Enable debug-object double call_rcu() testing");
+torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
-torture_param(int, onoff_interval, 0,
- "Time between CPU hotplugs (jiffies), 0=disable");
-torture_param(int, read_exit_delay, 13,
- "Delay between read-then-exit episodes (s)");
-torture_param(int, read_exit_burst, 16,
- "# of read-then-exit bursts per episode, zero to disable");
+torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
+torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
+torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
-torture_param(int, stall_cpu_holdoff, 10,
- "Time to wait before starting stall (s).");
+torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s).");
+torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall.");
torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stall_cpu_block, 0, "Sleep while stalling.");
-torture_param(int, stall_gp_kthread, 0,
- "Grace-period kthread stall duration (s).");
-torture_param(int, stat_interval, 60,
- "Number of seconds between stats printk()s");
+torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s).");
+torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-torture_param(int, test_boost_duration, 4,
- "Duration of each boost test, seconds.");
-torture_param(int, test_boost_interval, 7,
- "Interval between boost tests, seconds.");
-torture_param(bool, test_no_idle_hz, true,
- "Test support for tickless idle CPUs");
-torture_param(int, verbose, 1,
- "Enable verbose debugging printk()s");
+torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
+torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario.");
+torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
+static int nrealnocbers;
static int nrealreaders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
+static struct task_struct **nocb_tasks;
static struct task_struct *stats_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
static struct task_struct *stall_task;
-static struct task_struct *fwd_prog_task;
+static struct task_struct **fwd_prog_tasks;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
static struct task_struct *read_exit_task;
#define RCU_TORTURE_PIPE_LEN 10
+// Mailbox-like structure to check RCU global memory ordering.
+struct rcu_torture_reader_check {
+ unsigned long rtc_myloops;
+ int rtc_chkrdr;
+ unsigned long rtc_chkloops;
+ int rtc_ready;
+ struct rcu_torture_reader_check *rtc_assigner;
+} ____cacheline_internodealigned_in_smp;
+
+// Update-side data structure used to check RCU readers.
struct rcu_torture {
struct rcu_head rtort_rcu;
int rtort_pipe_count;
struct list_head rtort_free;
int rtort_mbtest;
+ struct rcu_torture_reader_check *rtort_chkp;
};
static LIST_HEAD(rcu_torture_freelist);
@@ -157,14 +172,16 @@ static DEFINE_SPINLOCK(rcu_torture_lock);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch);
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+static struct rcu_torture_reader_check *rcu_torture_reader_mbchk;
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
+static atomic_t n_rcu_torture_mbchk_fail;
+static atomic_t n_rcu_torture_mbchk_tries;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
-static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
@@ -174,6 +191,8 @@ static unsigned long n_read_exits;
static struct list_head rcu_torture_removed;
static unsigned long shutdown_jiffies;
static unsigned long start_gp_seq;
+static atomic_long_t n_nocb_offload;
+static atomic_long_t n_nocb_deoffload;
static int rcu_torture_writer_state;
#define RTWS_FIXED_DELAY 0
@@ -182,10 +201,24 @@ static int rcu_torture_writer_state;
#define RTWS_DEF_FREE 3
#define RTWS_EXP_SYNC 4
#define RTWS_COND_GET 5
-#define RTWS_COND_SYNC 6
-#define RTWS_SYNC 7
-#define RTWS_STUTTER 8
-#define RTWS_STOPPING 9
+#define RTWS_COND_GET_FULL 6
+#define RTWS_COND_GET_EXP 7
+#define RTWS_COND_GET_EXP_FULL 8
+#define RTWS_COND_SYNC 9
+#define RTWS_COND_SYNC_FULL 10
+#define RTWS_COND_SYNC_EXP 11
+#define RTWS_COND_SYNC_EXP_FULL 12
+#define RTWS_POLL_GET 13
+#define RTWS_POLL_GET_FULL 14
+#define RTWS_POLL_GET_EXP 15
+#define RTWS_POLL_GET_EXP_FULL 16
+#define RTWS_POLL_WAIT 17
+#define RTWS_POLL_WAIT_FULL 18
+#define RTWS_POLL_WAIT_EXP 19
+#define RTWS_POLL_WAIT_EXP_FULL 20
+#define RTWS_SYNC 21
+#define RTWS_STUTTER 22
+#define RTWS_STOPPING 23
static const char * const rcu_torture_writer_state_names[] = {
"RTWS_FIXED_DELAY",
"RTWS_DELAY",
@@ -193,7 +226,21 @@ static const char * const rcu_torture_writer_state_names[] = {
"RTWS_DEF_FREE",
"RTWS_EXP_SYNC",
"RTWS_COND_GET",
+ "RTWS_COND_GET_FULL",
+ "RTWS_COND_GET_EXP",
+ "RTWS_COND_GET_EXP_FULL",
"RTWS_COND_SYNC",
+ "RTWS_COND_SYNC_FULL",
+ "RTWS_COND_SYNC_EXP",
+ "RTWS_COND_SYNC_EXP_FULL",
+ "RTWS_POLL_GET",
+ "RTWS_POLL_GET_FULL",
+ "RTWS_POLL_GET_EXP",
+ "RTWS_POLL_GET_EXP_FULL",
+ "RTWS_POLL_WAIT",
+ "RTWS_POLL_WAIT_FULL",
+ "RTWS_POLL_WAIT_EXP",
+ "RTWS_POLL_WAIT_EXP_FULL",
"RTWS_SYNC",
"RTWS_STUTTER",
"RTWS_STOPPING",
@@ -220,12 +267,6 @@ static const char *rcu_torture_writer_state_getname(void)
return rcu_torture_writer_state_names[i];
}
-#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
-#define rcu_can_boost() 1
-#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
-#define rcu_can_boost() 0
-#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
-
#ifdef CONFIG_RCU_TRACE
static u64 notrace rcu_trace_clock_local(void)
{
@@ -259,7 +300,7 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
-static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
+static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
/*
* Allocate an element from the rcu_tortures pool.
@@ -306,23 +347,46 @@ struct rcu_torture_ops {
void (*read_delay)(struct torture_random_state *rrsp,
struct rt_read_seg *rtrsp);
void (*readunlock)(int idx);
+ int (*readlock_held)(void);
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
void (*sync)(void);
void (*exp_sync)(void);
- unsigned long (*get_state)(void);
+ unsigned long (*get_gp_state_exp)(void);
+ unsigned long (*start_gp_poll_exp)(void);
+ void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_gp_state_exp)(unsigned long oldstate);
+ void (*cond_sync_exp)(unsigned long oldstate);
+ void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*get_comp_state)(void);
+ void (*get_comp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*same_gp_state)(unsigned long oldstate1, unsigned long oldstate2);
+ bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
+ unsigned long (*get_gp_state)(void);
+ void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*get_gp_completed)(void);
+ void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
+ unsigned long (*start_gp_poll)(void);
+ void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_gp_state)(unsigned long oldstate);
+ bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+ bool (*poll_need_2gp)(bool poll, bool poll_full);
void (*cond_sync)(unsigned long oldstate);
+ void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
call_rcu_func_t call;
void (*cb_barrier)(void);
void (*fqs)(void);
void (*stats)(void);
void (*gp_kthread_dbg)(void);
+ bool (*check_boost_failed)(unsigned long gp_state, int *cpup);
int (*stall_dur)(void);
+ long cbflood_max;
int irq_capable;
int can_boost;
int extendables;
int slow_gps;
+ int no_pi_lock;
const char *name;
};
@@ -332,7 +396,12 @@ static struct rcu_torture_ops *cur_ops;
* Definitions for rcu torture testing.
*/
-static int rcu_torture_read_lock(void) __acquires(RCU)
+static int torture_readlock_not_held(void)
+{
+ return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();
+}
+
+static int rcu_torture_read_lock(void)
{
rcu_read_lock();
return 0;
@@ -351,7 +420,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!READ_ONCE(rcu_fwd_cb_nodelay) &&
+ if (!atomic_read(&rcu_fwd_cb_nodelay) &&
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
@@ -374,7 +443,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void rcu_torture_read_unlock(int idx) __releases(RCU)
+static void rcu_torture_read_unlock(int idx)
{
rcu_read_unlock();
}
@@ -386,7 +455,12 @@ static bool
rcu_torture_pipe_update_one(struct rcu_torture *rp)
{
int i;
+ struct rcu_torture_reader_check *rtrcp = READ_ONCE(rp->rtort_chkp);
+ if (rtrcp) {
+ WRITE_ONCE(rp->rtort_chkp, NULL);
+ smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire().
+ }
i = READ_ONCE(rp->rtort_pipe_count);
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
@@ -442,7 +516,7 @@ static unsigned long rcu_no_completed(void)
static void rcu_torture_deferred_free(struct rcu_torture *p)
{
- call_rcu(&p->rtort_rcu, rcu_torture_cb);
+ call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb);
}
static void rcu_sync_torture_init(void)
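
For context, a minimal sketch (not part of the patch) contrasting the two queueing primitives behind the call_rcu() to call_rcu_hurry() change above; example_lazy_free() and example_hurried_free() are hypothetical helpers:

	static void example_lazy_free(struct rcu_torture *p)
	{
		/* May be batched and deferred for seconds when CONFIG_RCU_LAZY=y. */
		call_rcu(&p->rtort_rcu, rcu_torture_cb);
	}

	static void example_hurried_free(struct rcu_torture *p)
	{
		/* Always requests a timely grace period, as the patch now does. */
		call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb);
	}
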
@@ -450,29 +524,54 @@ static void rcu_sync_torture_init(void)
INIT_LIST_HEAD(&rcu_torture_removed);
}
+static bool rcu_poll_need_2gp(bool poll, bool poll_full)
+{
+ return poll;
+}
+
static struct rcu_torture_ops rcu_ops = {
- .ttype = RCU_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock,
- .read_delay = rcu_read_delay,
- .readunlock = rcu_torture_read_unlock,
- .get_gp_seq = rcu_get_gp_seq,
- .gp_diff = rcu_seq_diff,
- .deferred_free = rcu_torture_deferred_free,
- .sync = synchronize_rcu,
- .exp_sync = synchronize_rcu_expedited,
- .get_state = get_state_synchronize_rcu,
- .cond_sync = cond_synchronize_rcu,
- .call = call_rcu,
- .cb_barrier = rcu_barrier,
- .fqs = rcu_force_quiescent_state,
- .stats = NULL,
- .gp_kthread_dbg = show_rcu_gp_kthreads,
- .stall_dur = rcu_jiffies_till_stall_check,
- .irq_capable = 1,
- .can_boost = rcu_can_boost(),
- .extendables = RCUTORTURE_MAX_EXTEND,
- .name = "rcu"
+ .ttype = RCU_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay,
+ .readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_get_gp_seq,
+ .gp_diff = rcu_seq_diff,
+ .deferred_free = rcu_torture_deferred_free,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .same_gp_state = same_state_synchronize_rcu,
+ .same_gp_state_full = same_state_synchronize_rcu_full,
+ .get_comp_state = get_completed_synchronize_rcu,
+ .get_comp_state_full = get_completed_synchronize_rcu_full,
+ .get_gp_state = get_state_synchronize_rcu,
+ .get_gp_state_full = get_state_synchronize_rcu_full,
+ .get_gp_completed = get_completed_synchronize_rcu,
+ .get_gp_completed_full = get_completed_synchronize_rcu_full,
+ .start_gp_poll = start_poll_synchronize_rcu,
+ .start_gp_poll_full = start_poll_synchronize_rcu_full,
+ .poll_gp_state = poll_state_synchronize_rcu,
+ .poll_gp_state_full = poll_state_synchronize_rcu_full,
+ .poll_need_2gp = rcu_poll_need_2gp,
+ .cond_sync = cond_synchronize_rcu,
+ .cond_sync_full = cond_synchronize_rcu_full,
+ .get_gp_state_exp = get_state_synchronize_rcu,
+ .start_gp_poll_exp = start_poll_synchronize_rcu_expedited,
+ .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full,
+ .poll_gp_state_exp = poll_state_synchronize_rcu,
+ .cond_sync_exp = cond_synchronize_rcu_expedited,
+ .call = call_rcu_hurry,
+ .cb_barrier = rcu_barrier,
+ .fqs = rcu_force_quiescent_state,
+ .stats = NULL,
+ .gp_kthread_dbg = show_rcu_gp_kthreads,
+ .check_boost_failed = rcu_check_boost_fail,
+ .stall_dur = rcu_jiffies_till_stall_check,
+ .irq_capable = 1,
+ .can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
+ .extendables = RCUTORTURE_MAX_EXTEND,
+ .name = "rcu"
};
/*
@@ -506,6 +605,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.readlock = rcu_torture_read_lock,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
+ .readlock_held = torture_readlock_not_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_busted_torture_deferred_free,
.sync = synchronize_rcu_busted,
@@ -525,10 +625,14 @@ static struct rcu_torture_ops rcu_busted_ops = {
DEFINE_STATIC_SRCU(srcu_ctl);
static struct srcu_struct srcu_ctld;
static struct srcu_struct *srcu_ctlp = &srcu_ctl;
+static struct rcu_torture_ops srcud_ops;
-static int srcu_torture_read_lock(void) __acquires(srcu_ctlp)
+static int srcu_torture_read_lock(void)
{
- return srcu_read_lock(srcu_ctlp);
+ if (cur_ops == &srcud_ops)
+ return srcu_read_lock_nmisafe(srcu_ctlp);
+ else
+ return srcu_read_lock(srcu_ctlp);
}
static void
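
For context, a minimal sketch (not part of the patch) of the NMI-safe SRCU reader API used above for the srcud flavor; example_srcu and example_nmi_reader() are hypothetical names. A given srcu_struct should be read either only via the _nmisafe variants or only via the plain ones, which is why the readlock/readunlock paths key off cur_ops:

	DEFINE_STATIC_SRCU(example_srcu);		/* hypothetical srcu_struct */

	static void example_nmi_reader(void)
	{
		int idx;

		idx = srcu_read_lock_nmisafe(&example_srcu);
		/* Read-side critical section, usable even from NMI context. */
		srcu_read_unlock_nmisafe(&example_srcu, idx);
	}
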
@@ -550,9 +654,17 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
}
}
-static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp)
+static void srcu_torture_read_unlock(int idx)
+{
+ if (cur_ops == &srcud_ops)
+ srcu_read_unlock_nmisafe(srcu_ctlp, idx);
+ else
+ srcu_read_unlock(srcu_ctlp, idx);
+}
+
+static int torture_srcu_read_lock_held(void)
{
- srcu_read_unlock(srcu_ctlp, idx);
+ return srcu_read_lock_held(srcu_ctlp);
}
static unsigned long srcu_torture_completed(void)
@@ -570,6 +682,21 @@ static void srcu_torture_synchronize(void)
synchronize_srcu(srcu_ctlp);
}
+static unsigned long srcu_torture_get_gp_state(void)
+{
+ return get_state_synchronize_srcu(srcu_ctlp);
+}
+
+static unsigned long srcu_torture_start_gp_poll(void)
+{
+ return start_poll_synchronize_srcu(srcu_ctlp);
+}
+
+static bool srcu_torture_poll_gp_state(unsigned long oldstate)
+{
+ return poll_state_synchronize_srcu(srcu_ctlp, oldstate);
+}
+
static void srcu_torture_call(struct rcu_head *head,
rcu_callback_t func)
{
@@ -597,14 +724,20 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcu"
};
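
For context, a minimal sketch (not part of the patch) of the SRCU polled grace-period primitives now wired into srcu_ops; example_srcu_poll() is a hypothetical helper:

	static void example_srcu_poll(struct srcu_struct *ssp)
	{
		unsigned long cookie;

		cookie = start_poll_synchronize_srcu(ssp);	/* snapshot state and start a GP */
		while (!poll_state_synchronize_srcu(ssp, cookie))
			schedule_timeout_uninterruptible(1);	/* no SRCU reader is held here */
		/* get_state_synchronize_srcu(ssp) returns a comparable cookie
		 * without forcing a grace period to start. */
	}
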
@@ -629,14 +762,20 @@ static struct rcu_torture_ops srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
+ .get_gp_state = srcu_torture_get_gp_state,
+ .start_gp_poll = srcu_torture_start_gp_poll,
+ .poll_gp_state = srcu_torture_poll_gp_state,
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcud"
};
@@ -648,6 +787,7 @@ static struct rcu_torture_ops busted_srcud_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
@@ -656,11 +796,56 @@ static struct rcu_torture_ops busted_srcud_ops = {
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
.irq_capable = 1,
+ .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted_srcud"
};
/*
+ * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
+ * This implementation does not necessarily work well with CPU hotplug.
+ */
+
+static void synchronize_rcu_trivial(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ torture_sched_setaffinity(current->pid, cpumask_of(cpu));
+ WARN_ON_ONCE(raw_smp_processor_id() != cpu);
+ }
+}
+
+static int rcu_torture_read_lock_trivial(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void rcu_torture_read_unlock_trivial(int idx)
+{
+ preempt_enable();
+}
+
+static struct rcu_torture_ops trivial_ops = {
+ .ttype = RCU_TRIVIAL_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock_trivial,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock_trivial,
+ .readlock_held = torture_readlock_not_held,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_trivial,
+ .exp_sync = synchronize_rcu_trivial,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "trivial"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
* Definitions for RCU-tasks torture testing.
*/
@@ -680,7 +865,7 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
static void synchronize_rcu_mult_test(void)
{
- synchronize_rcu_mult(call_rcu_tasks, call_rcu);
+ synchronize_rcu_mult(call_rcu_tasks, call_rcu_hurry);
}
static struct rcu_torture_ops tasks_ops = {
@@ -703,46 +888,16 @@ static struct rcu_torture_ops tasks_ops = {
.name = "tasks"
};
-/*
- * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
- * This implementation does not necessarily work well with CPU hotplug.
- */
+#define TASKS_OPS &tasks_ops,
-static void synchronize_rcu_trivial(void)
-{
- int cpu;
+#else // #ifdef CONFIG_TASKS_RCU
- for_each_online_cpu(cpu) {
- rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
- WARN_ON_ONCE(raw_smp_processor_id() != cpu);
- }
-}
+#define TASKS_OPS
-static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
-{
- preempt_disable();
- return 0;
-}
+#endif // #else #ifdef CONFIG_TASKS_RCU
-static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
-{
- preempt_enable();
-}
-static struct rcu_torture_ops trivial_ops = {
- .ttype = RCU_TRIVIAL_FLAVOR,
- .init = rcu_sync_torture_init,
- .readlock = rcu_torture_read_lock_trivial,
- .read_delay = rcu_read_delay, /* just reuse rcu's version. */
- .readunlock = rcu_torture_read_unlock_trivial,
- .get_gp_seq = rcu_no_completed,
- .sync = synchronize_rcu_trivial,
- .exp_sync = synchronize_rcu_trivial,
- .fqs = NULL,
- .stats = NULL,
- .irq_capable = 1,
- .name = "trivial"
-};
+#ifdef CONFIG_TASKS_RUDE_RCU
/*
* Definitions for rude RCU-tasks torture testing.
@@ -766,12 +921,24 @@ static struct rcu_torture_ops tasks_rude_ops = {
.call = call_rcu_tasks_rude,
.cb_barrier = rcu_barrier_tasks_rude,
.gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
.name = "tasks-rude"
};
+#define TASKS_RUDE_OPS &tasks_rude_ops,
+
+#else // #ifdef CONFIG_TASKS_RUDE_RCU
+
+#define TASKS_RUDE_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU
+
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
/*
* Definitions for tracing RCU-tasks torture testing.
*/
@@ -798,6 +965,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.readlock = tasks_tracing_torture_read_lock,
.read_delay = srcu_read_delay, /* just reuse srcu's version. */
.readunlock = tasks_tracing_torture_read_unlock,
+ .readlock_held = rcu_read_lock_trace_held,
.get_gp_seq = rcu_no_completed,
.deferred_free = rcu_tasks_tracing_torture_deferred_free,
.sync = synchronize_rcu_tasks_trace,
@@ -805,6 +973,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
.gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
+ .cbflood_max = 50000,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -812,6 +981,15 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.name = "tasks-tracing"
};
+#define TASKS_TRACING_OPS &tasks_tracing_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define TASKS_TRACING_OPS
+
+#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU
+
+
static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -819,32 +997,13 @@ static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
return cur_ops->gp_diff(new, old);
}
-static bool __maybe_unused torturing_tasks(void)
-{
- return cur_ops == &tasks_ops || cur_ops == &tasks_rude_ops;
-}
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
- * CPU for moderate bursts, repeatedly registering RCU callbacks and
- * spinning waiting for them to be invoked. If a given callback takes
- * too long to be invoked, we assume that priority inversion has occurred.
+ * CPU for moderate bursts, repeatedly starting grace periods and waiting
+ * for them to complete. If a given grace period takes too long, we assume
+ * that priority inversion has occurred.
*/
-struct rcu_boost_inflight {
- struct rcu_head rcu;
- int inflight;
-};
-
-static void rcu_torture_boost_cb(struct rcu_head *head)
-{
- struct rcu_boost_inflight *rbip =
- container_of(head, struct rcu_boost_inflight, rcu);
-
- /* Ensure RCU-core accesses precede clearing ->inflight */
- smp_store_release(&rbip->inflight, 0);
-}
-
static int old_rt_runtime = -1;
static void rcu_torture_disable_rt_throttle(void)
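
For context, a minimal sketch (not part of the patch) of the polling pattern the reworked boost test builds on: start a grace period, poll for its completion, and treat an overlong wait as a likely priority inversion. example_boost_check() and its mininterval parameter are hypothetical:

	static bool example_boost_check(unsigned long mininterval)
	{
		unsigned long gp_state = start_poll_synchronize_rcu();	/* start a GP */
		unsigned long start = jiffies;

		while (!poll_state_synchronize_rcu(gp_state)) {
			if (time_after(jiffies, start + mininterval))
				return true;	/* GP outlived the interval: suspect inversion */
			schedule_timeout_interruptible(1);
		}
		return false;			/* GP completed in time */
	}
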
@@ -871,50 +1030,73 @@ static void rcu_torture_enable_rt_throttle(void)
old_rt_runtime = -1;
}
-static bool rcu_torture_boost_failed(unsigned long start, unsigned long end)
+static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start)
{
- if (end - start > test_boost_duration * HZ - HZ / 2) {
+ int cpu;
+ static int dbg_done;
+ unsigned long end = jiffies;
+ bool gp_done;
+ unsigned long j;
+ static unsigned long last_persist;
+ unsigned long lp;
+ unsigned long mininterval = test_boost_duration * HZ - HZ / 2;
+
+ if (end - *start > mininterval) {
+ // Recheck after checking time to avoid false positives.
+ smp_mb(); // Time check before grace-period check.
+ if (cur_ops->poll_gp_state(gp_state))
+ return false; // passed, though perhaps just barely
+ if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) {
+ // At most one persisted message per boost test.
+ j = jiffies;
+ lp = READ_ONCE(last_persist);
+ if (time_after(j, lp + mininterval) && cmpxchg(&last_persist, lp, j) == lp)
+ pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu);
+ return false; // passed on a technicality
+ }
VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
n_rcu_torture_boost_failure++;
+ if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) {
+ pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n",
+ current->rt_priority, gp_state, end - *start);
+ cur_ops->gp_kthread_dbg();
+ // Recheck after print to flag grace period ending during splat.
+ gp_done = cur_ops->poll_gp_state(gp_state);
+ pr_info("Boost inversion: GP %lu %s.\n", gp_state,
+ gp_done ? "ended already" : "still pending");
- return true; /* failed */
+ }
+
+ return true; // failed
+ } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) {
+ *start = jiffies;
}
- return false; /* passed */
+ return false; // passed
}
static int rcu_torture_boost(void *arg)
{
- unsigned long call_rcu_time;
unsigned long endtime;
+ unsigned long gp_state;
+ unsigned long gp_state_time;
unsigned long oldstarttime;
- struct rcu_boost_inflight rbi = { .inflight = 0 };
VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
sched_set_fifo_low(current);
- init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
do {
- /* Track if the test failed already in this test interval? */
- bool failed = false;
+ bool failed = false; // Test failed already in this test interval
+ bool gp_initiated = false;
- /* Increment n_rcu_torture_boosts once per boost-test */
- while (!kthread_should_stop()) {
- if (mutex_trylock(&boost_mutex)) {
- n_rcu_torture_boosts++;
- mutex_unlock(&boost_mutex);
- break;
- }
- schedule_timeout_uninterruptible(1);
- }
if (kthread_should_stop())
goto checkwait;
/* Wait for the next test interval. */
- oldstarttime = boost_starttime;
+ oldstarttime = READ_ONCE(boost_starttime);
while (time_before(jiffies, oldstarttime)) {
schedule_timeout_interruptible(oldstarttime - jiffies);
if (stutter_wait("rcu_torture_boost"))
@@ -923,34 +1105,33 @@ static int rcu_torture_boost(void *arg)
goto checkwait;
}
- /* Do one boost-test interval. */
+ // Do one boost-test interval.
endtime = oldstarttime + test_boost_duration * HZ;
- call_rcu_time = jiffies;
while (time_before(jiffies, endtime)) {
- /* If we don't have a callback in flight, post one. */
- if (!smp_load_acquire(&rbi.inflight)) {
- /* RCU core before ->inflight = 1. */
- smp_store_release(&rbi.inflight, 1);
- call_rcu(&rbi.rcu, rcu_torture_boost_cb);
- /* Check if the boost test failed */
- failed = failed ||
- rcu_torture_boost_failed(call_rcu_time,
- jiffies);
- call_rcu_time = jiffies;
+ // Has current GP gone too long?
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ failed = rcu_torture_boost_failed(gp_state, &gp_state_time);
+ // If we don't have a grace period in flight, start one.
+ if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) {
+ gp_state = cur_ops->start_gp_poll();
+ gp_initiated = true;
+ gp_state_time = jiffies;
}
- if (stutter_wait("rcu_torture_boost"))
+ if (stutter_wait("rcu_torture_boost")) {
sched_set_fifo_low(current);
+ // If the grace period already ended,
+ // we don't know when that happened, so
+ // start over.
+ if (cur_ops->poll_gp_state(gp_state))
+ gp_initiated = false;
+ }
if (torture_must_stop())
goto checkwait;
}
- /*
- * If boost never happened, then inflight will always be 1, in
- * this case the boost check would never happen in the above
- * loop so do another one here.
- */
- if (!failed && smp_load_acquire(&rbi.inflight))
- rcu_torture_boost_failed(call_rcu_time, jiffies);
+ // In case the grace period extended beyond the end of the loop.
+ if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state))
+ rcu_torture_boost_failed(gp_state, &gp_state_time);
/*
* Set the start time of the next test interval.
@@ -959,15 +1140,17 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime &&
- !kthread_should_stop()) {
+ while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
- boost_starttime = jiffies +
- test_boost_interval * HZ;
+ if (oldstarttime == boost_starttime) {
+ WRITE_ONCE(boost_starttime,
+ jiffies + test_boost_interval * HZ);
+ n_rcu_torture_boosts++;
+ }
mutex_unlock(&boost_mutex);
break;
}
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
/* Go do the stutter. */
@@ -976,11 +1159,10 @@ checkwait: if (stutter_wait("rcu_torture_boost"))
} while (!torture_must_stop());
/* Clean up and exit. */
- while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) {
+ while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
}
- destroy_rcu_head_on_stack(&rbi.rcu);
torture_kthread_stopping("rcu_torture_boost");
return 0;
}
@@ -1002,7 +1184,7 @@ rcu_torture_fqs(void *arg)
fqs_resume_time = jiffies + fqs_stutter * HZ;
while (time_before(jiffies, fqs_resume_time) &&
!kthread_should_stop()) {
- schedule_timeout_interruptible(1);
+ schedule_timeout_interruptible(HZ / 20);
}
fqs_burst_remaining = fqs_duration;
while (fqs_burst_remaining > 0 &&
@@ -1018,44 +1200,69 @@ rcu_torture_fqs(void *arg)
return 0;
}
+// Used by writers to randomly choose from the available grace-period primitives.
+static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { };
+static int nsynctypes;
+
/*
- * RCU torture writer kthread. Repeatedly substitutes a new structure
- * for that pointed to by rcu_torture_current, freeing the old structure
- * after a series of grace periods (the "pipeline").
+ * Determine which grace-period primitives are available.
*/
-static int
-rcu_torture_writer(void *arg)
+static void rcu_torture_write_types(void)
{
- bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
- int expediting = 0;
- unsigned long gp_snap;
- bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
- bool gp_sync1 = gp_sync;
- int i;
- int oldnice = task_nice(current);
- struct rcu_torture *rp;
- struct rcu_torture *old_rp;
- static DEFINE_TORTURE_RANDOM(rand);
- bool stutter_waited;
- int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
- RTWS_COND_GET, RTWS_SYNC };
- int nsynctypes = 0;
-
- VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
- if (!can_expedite)
- pr_alert("%s" TORTURE_FLAG
- " GP expediting controlled from boot/sysfs for %s.\n",
- torture_type, cur_ops->name);
+ bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full;
+ bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp;
+ bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
+ bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
/* Initialize synctype[] array. If none set, take default. */
- if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
- gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
- if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) {
+ if (!gp_cond1 &&
+ !gp_cond_exp1 &&
+ !gp_cond_full1 &&
+ !gp_cond_exp_full1 &&
+ !gp_exp1 &&
+ !gp_poll_exp1 &&
+ !gp_poll_exp_full1 &&
+ !gp_normal1 &&
+ !gp_poll1 &&
+ !gp_poll_full1 &&
+ !gp_sync1) {
+ gp_cond1 = true;
+ gp_cond_exp1 = true;
+ gp_cond_full1 = true;
+ gp_cond_exp_full1 = true;
+ gp_exp1 = true;
+ gp_poll_exp1 = true;
+ gp_poll_exp_full1 = true;
+ gp_normal1 = true;
+ gp_poll1 = true;
+ gp_poll_full1 = true;
+ gp_sync1 = true;
+ }
+ if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
synctype[nsynctypes++] = RTWS_COND_GET;
pr_info("%s: Testing conditional GPs.\n", __func__);
- } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) {
+ } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) {
pr_alert("%s: gp_cond without primitives.\n", __func__);
}
+ if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP;
+ pr_info("%s: Testing conditional expedited GPs.\n", __func__);
+ } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
+ pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
+ }
+ if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_FULL;
+ pr_info("%s: Testing conditional full-state GPs.\n", __func__);
+ } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) {
+ pr_alert("%s: gp_cond_full without primitives.\n", __func__);
+ }
+ if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) {
+ synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL;
+ pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__);
+ } else if (gp_cond_exp_full &&
+ (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) {
+ pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__);
+ }
if (gp_exp1 && cur_ops->exp_sync) {
synctype[nsynctypes++] = RTWS_EXP_SYNC;
pr_info("%s: Testing expedited GPs.\n", __func__);
@@ -1068,14 +1275,109 @@ rcu_torture_writer(void *arg)
} else if (gp_normal && !cur_ops->deferred_free) {
pr_alert("%s: gp_normal without primitives.\n", __func__);
}
+ if (gp_poll1 && cur_ops->get_comp_state && cur_ops->same_gp_state &&
+ cur_ops->start_gp_poll && cur_ops->poll_gp_state) {
+ synctype[nsynctypes++] = RTWS_POLL_GET;
+ pr_info("%s: Testing polling GPs.\n", __func__);
+ } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
+ pr_alert("%s: gp_poll without primitives.\n", __func__);
+ }
+ if (gp_poll_full1 && cur_ops->get_comp_state_full && cur_ops->same_gp_state_full
+ && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_FULL;
+ pr_info("%s: Testing polling full-state GPs.\n", __func__);
+ } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_full without primitives.\n", __func__);
+ }
+ if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
+ pr_info("%s: Testing polling expedited GPs.\n", __func__);
+ } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
+ pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
+ }
+ if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) {
+ synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL;
+ pr_info("%s: Testing polling full-state expedited GPs.\n", __func__);
+ } else if (gp_poll_exp_full &&
+ (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) {
+ pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__);
+ }
if (gp_sync1 && cur_ops->sync) {
synctype[nsynctypes++] = RTWS_SYNC;
pr_info("%s: Testing normal GPs.\n", __func__);
} else if (gp_sync && !cur_ops->sync) {
pr_alert("%s: gp_sync without primitives.\n", __func__);
}
+}
+
+/*
+ * Do the specified rcu_torture_writer() synchronous grace period,
+ * while also testing out the polled APIs. Note well that the single-CPU
+ * grace-period optimizations must be accounted for.
+ */
+static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
+{
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
+ bool dopoll;
+ bool dopoll_full;
+ unsigned long r = torture_random(trsp);
+
+ dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300);
+ dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00);
+ if (dopoll || dopoll_full)
+ cpus_read_lock();
+ if (dopoll)
+ cookie = cur_ops->get_gp_state();
+ if (dopoll_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full))
+ sync();
+ sync();
+ WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 3 failed %pS() online %*pbl.",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 4 failed %pS() online %*pbl",
+ __func__, sync, cpumask_pr_args(cpu_online_mask));
+ if (dopoll || dopoll_full)
+ cpus_read_unlock();
+}
+
+/*
+ * RCU torture writer kthread. Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+ bool boot_ended;
+ bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
+ int expediting = 0;
+ unsigned long gp_snap;
+ unsigned long gp_snap1;
+ struct rcu_gp_oldstate gp_snap_full;
+ struct rcu_gp_oldstate gp_snap1_full;
+ int i;
+ int idx;
+ int oldnice = task_nice(current);
+ struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE];
+ struct rcu_torture *rp;
+ struct rcu_torture *old_rp;
+ static DEFINE_TORTURE_RANDOM(rand);
+ bool stutter_waited;
+ unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE];
+
+ VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+ if (!can_expedite)
+ pr_alert("%s" TORTURE_FLAG
+ " GP expediting controlled from boot/sysfs for %s.\n",
+ torture_type, cur_ops->name);
if (WARN_ONCE(nsynctypes == 0,
- "rcu_torture_writer: No update-side primitives.\n")) {
+ "%s: No update-side primitives.\n", __func__)) {
/*
* No update-side primitives, so don't try updating.
* The resulting test won't be testing much, hence the
@@ -1083,11 +1385,12 @@ rcu_torture_writer(void *arg)
*/
rcu_torture_writer_state = RTWS_STOPPING;
torture_kthread_stopping("rcu_torture_writer");
+ return 0;
}
do {
rcu_torture_writer_state = RTWS_FIXED_DELAY;
- schedule_timeout_uninterruptible(1);
+ torture_hrtimeout_us(500, 1000, &rand);
rp = rcu_torture_alloc();
if (rp == NULL)
continue;
@@ -1107,6 +1410,38 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
WRITE_ONCE(old_rp->rtort_pipe_count,
old_rp->rtort_pipe_count + 1);
+
+ // Make sure readers block polled grace periods.
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
+ idx = cur_ops->readlock();
+ cookie = cur_ops->get_gp_state();
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 1 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_completed) {
+ cookie = cur_ops->get_gp_completed();
+ WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+ }
+ cur_ops->readunlock(idx);
+ }
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) {
+ idx = cur_ops->readlock();
+ cur_ops->get_gp_state_full(&cookie_full);
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 5 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ if (cur_ops->get_gp_completed_full) {
+ cur_ops->get_gp_completed_full(&cookie_full);
+ WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+ }
+ cur_ops->readunlock(idx);
+ }
switch (synctype[torture_random(&rand) % nsynctypes]) {
case RTWS_DEF_FREE:
rcu_torture_writer_state = RTWS_DEF_FREE;
@@ -1114,23 +1449,103 @@ rcu_torture_writer(void *arg)
break;
case RTWS_EXP_SYNC:
rcu_torture_writer_state = RTWS_EXP_SYNC;
- cur_ops->exp_sync();
+ do_rtws_sync(&rand, cur_ops->exp_sync);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_COND_GET:
rcu_torture_writer_state = RTWS_COND_GET;
- gp_snap = cur_ops->get_state();
- i = torture_random(&rand) % 16;
- if (i != 0)
- schedule_timeout_interruptible(i);
- udelay(torture_random(&rand) % 1000);
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC;
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
break;
+ case RTWS_COND_GET_EXP:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP;
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP;
+ cur_ops->cond_sync_exp(gp_snap);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_FULL;
+ cur_ops->cond_sync_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL;
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL;
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET:
+ rcu_torture_writer_state = RTWS_POLL_GET;
+ for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ ulo[i] = cur_ops->get_comp_state();
+ gp_snap = cur_ops->start_gp_poll();
+ rcu_torture_writer_state = RTWS_POLL_WAIT;
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ gp_snap1 = cur_ops->get_gp_state();
+ for (i = 0; i < ARRAY_SIZE(ulo); i++)
+ if (cur_ops->poll_gp_state(ulo[i]) ||
+ cur_ops->same_gp_state(ulo[i], gp_snap1)) {
+ ulo[i] = gp_snap1;
+ break;
+ }
+ WARN_ON_ONCE(i >= ARRAY_SIZE(ulo));
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_FULL;
+ for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ cur_ops->get_comp_state_full(&rgo[i]);
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ cur_ops->get_gp_state_full(&gp_snap1_full);
+ for (i = 0; i < ARRAY_SIZE(rgo); i++)
+ if (cur_ops->poll_gp_state_full(&rgo[i]) ||
+ cur_ops->same_gp_state_full(&rgo[i],
+ &gp_snap1_full)) {
+ rgo[i] = gp_snap1_full;
+ break;
+ }
+ WARN_ON_ONCE(i >= ARRAY_SIZE(rgo));
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_EXP:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP;
+ gp_snap = cur_ops->start_gp_poll_exp();
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP;
+ while (!cur_ops->poll_gp_state_exp(gp_snap))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
+ case RTWS_POLL_GET_EXP_FULL:
+ rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL;
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL;
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full))
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ rcu_torture_pipe_update(old_rp);
+ break;
case RTWS_SYNC:
rcu_torture_writer_state = RTWS_SYNC;
- cur_ops->sync();
+ do_rtws_sync(&rand, cur_ops->sync);
rcu_torture_pipe_update(old_rp);
break;
default:
@@ -1155,18 +1570,21 @@ rcu_torture_writer(void *arg)
!rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
+ boot_ended = rcu_inkernel_boot_has_ended();
stutter_waited = stutter_wait("rcu_torture_writer");
if (stutter_waited &&
- !READ_ONCE(rcu_fwd_cb_nodelay) &&
+ !atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
- rcu_inkernel_boot_has_ended())
+ boot_ended)
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
&rcu_tortures[i]) {
- rcu_ftrace_dump(DUMP_ALL);
+ tracing_off();
+ show_rcu_gp_kthreads();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
+ rcu_ftrace_dump(DUMP_ALL);
}
if (stutter_waited)
sched_set_normal(current, oldnice);
@@ -1194,26 +1612,91 @@ rcu_torture_writer(void *arg)
static int
rcu_torture_fakewriter(void *arg)
{
+ unsigned long gp_snap;
+ struct rcu_gp_oldstate gp_snap_full;
DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
set_user_nice(current, MAX_NICE);
+ if (WARN_ONCE(nsynctypes == 0,
+ "%s: No update-side primitives.\n", __func__)) {
+ /*
+ * No update-side primitives, so don't try updating.
+ * The resulting test won't be testing much, hence the
+ * above WARN_ONCE().
+ */
+ torture_kthread_stopping("rcu_torture_fakewriter");
+ return 0;
+ }
+
do {
- schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
- udelay(torture_random(&rand) & 0x3ff);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
torture_random(&rand) % (nfakewriters * 8) == 0) {
cur_ops->cb_barrier();
- } else if (gp_normal == gp_exp) {
- if (cur_ops->sync && torture_random(&rand) & 0x80)
- cur_ops->sync();
- else if (cur_ops->exp_sync)
+ } else {
+ switch (synctype[torture_random(&rand) % nsynctypes]) {
+ case RTWS_DEF_FREE:
+ break;
+ case RTWS_EXP_SYNC:
cur_ops->exp_sync();
- } else if (gp_normal && cur_ops->sync) {
- cur_ops->sync();
- } else if (cur_ops->exp_sync) {
- cur_ops->exp_sync();
+ break;
+ case RTWS_COND_GET:
+ gp_snap = cur_ops->get_gp_state();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync(gp_snap);
+ break;
+ case RTWS_COND_GET_EXP:
+ gp_snap = cur_ops->get_gp_state_exp();
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp(gp_snap);
+ break;
+ case RTWS_COND_GET_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_full(&gp_snap_full);
+ break;
+ case RTWS_COND_GET_EXP_FULL:
+ cur_ops->get_gp_state_full(&gp_snap_full);
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ cur_ops->cond_sync_exp_full(&gp_snap_full);
+ break;
+ case RTWS_POLL_GET:
+ gp_snap = cur_ops->start_gp_poll();
+ while (!cur_ops->poll_gp_state(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_FULL:
+ cur_ops->start_gp_poll_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_EXP:
+ gp_snap = cur_ops->start_gp_poll_exp();
+ while (!cur_ops->poll_gp_state_exp(gp_snap)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_POLL_GET_EXP_FULL:
+ cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+ while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+ torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+ &rand);
+ }
+ break;
+ case RTWS_SYNC:
+ cur_ops->sync();
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
}
stutter_wait("rcu_torture_fakewriter");
} while (!torture_must_stop());
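
The fake-writer loop above now exercises the same grace-period primitives as the main writer, selected from synctype[]. For the conditional and polled cases, the vanilla-RCU calls behind cur_ops->get_gp_state(), cond_sync(), start_gp_poll() and poll_gp_state() follow the pattern sketched below (illustrative only; example_poll_for_gp() is a hypothetical helper, not part of the patch):

	#include <linux/rcupdate.h>
	#include <linux/sched.h>

	static void example_poll_for_gp(void)
	{
		unsigned long cookie;

		/* Conditional wait: skip the wait if a full grace period already elapsed. */
		cookie = get_state_synchronize_rcu();
		/* ... unrelated work ... */
		cond_synchronize_rcu(cookie);

		/* Polled wait: start a grace period, then poll until it completes. */
		cookie = start_poll_synchronize_rcu();
		while (!poll_state_synchronize_rcu(cookie))
			schedule_timeout_uninterruptible(1);
	}
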
@@ -1227,6 +1710,62 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp)
kfree(rhp);
}
+// Set up and carry out testing of RCU's global memory ordering
+static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
+ struct torture_random_state *trsp)
+{
+ unsigned long loops;
+ int noc = torture_num_online_cpus();
+ int rdrchked;
+ int rdrchker;
+ struct rcu_torture_reader_check *rtrcp; // Me.
+ struct rcu_torture_reader_check *rtrcp_assigner; // Assigned us to do checking.
+ struct rcu_torture_reader_check *rtrcp_chked; // Reader being checked.
+ struct rcu_torture_reader_check *rtrcp_chker; // Reader doing checking when not me.
+
+ if (myid < 0)
+ return; // Don't try this from timer handlers.
+
+ // Increment my counter.
+ rtrcp = &rcu_torture_reader_mbchk[myid];
+ WRITE_ONCE(rtrcp->rtc_myloops, rtrcp->rtc_myloops + 1);
+
+ // Attempt to assign someone else some checking work.
+ rdrchked = torture_random(trsp) % nrealreaders;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ rdrchker = torture_random(trsp) % nrealreaders;
+ rtrcp_chker = &rcu_torture_reader_mbchk[rdrchker];
+ if (rdrchked != myid && rdrchked != rdrchker && noc >= rdrchked && noc >= rdrchker &&
+ smp_load_acquire(&rtrcp->rtc_chkrdr) < 0 && // Pairs with smp_store_release below.
+ !READ_ONCE(rtp->rtort_chkp) &&
+ !smp_load_acquire(&rtrcp_chker->rtc_assigner)) { // Pairs with smp_store_release below.
+ rtrcp->rtc_chkloops = READ_ONCE(rtrcp_chked->rtc_myloops);
+ WARN_ON_ONCE(rtrcp->rtc_chkrdr >= 0);
+ rtrcp->rtc_chkrdr = rdrchked;
+ WARN_ON_ONCE(rtrcp->rtc_ready); // This gets set after the grace period ends.
+ if (cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, NULL, rtrcp) ||
+ cmpxchg_relaxed(&rtp->rtort_chkp, NULL, rtrcp))
+ (void)cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, rtrcp, NULL); // Back out.
+ }
+
+ // If assigned some completed work, do it!
+ rtrcp_assigner = READ_ONCE(rtrcp->rtc_assigner);
+ if (!rtrcp_assigner || !smp_load_acquire(&rtrcp_assigner->rtc_ready))
+ return; // No work or work not yet ready.
+ rdrchked = rtrcp_assigner->rtc_chkrdr;
+ if (WARN_ON_ONCE(rdrchked < 0))
+ return;
+ rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked];
+ loops = READ_ONCE(rtrcp_chked->rtc_myloops);
+ atomic_inc(&n_rcu_torture_mbchk_tries);
+ if (ULONG_CMP_LT(loops, rtrcp_assigner->rtc_chkloops))
+ atomic_inc(&n_rcu_torture_mbchk_fail);
+ rtrcp_assigner->rtc_chkloops = loops + ULONG_MAX / 2;
+ rtrcp_assigner->rtc_ready = 0;
+ smp_store_release(&rtrcp->rtc_assigner, NULL); // Someone else can assign us work.
+ smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign.
+}
+
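
rcu_torture_reader_do_mbchk() checks RCU's global memory ordering by handing loop-count snapshots between readers; the handoff relies on release/acquire pairing, with smp_store_release() publishing state that a later smp_load_acquire() observes. A stripped-down sketch of that pairing, using hypothetical names, looks like this:

	#include <linux/atomic.h>
	#include <linux/types.h>

	struct handoff {
		unsigned long data;
		int ready;
	};

	static void example_publish(struct handoff *h, unsigned long v)
	{
		h->data = v;				/* Plain store first... */
		smp_store_release(&h->ready, 1);	/* ...then release-publish. */
	}

	static bool example_consume(struct handoff *h, unsigned long *vp)
	{
		if (!smp_load_acquire(&h->ready))	/* Pairs with the release above. */
			return false;
		*vp = h->data;				/* Guaranteed to see v. */
		return true;
	}
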
/*
* Do one extension of an RCU read-side critical section using the
* current reader state in readstate (set to zero for initial entry
@@ -1241,46 +1780,64 @@ static void rcutorture_one_extend(int *readstate, int newstate,
struct rt_read_seg *rtrsp)
{
unsigned long flags;
- int idxnew = -1;
- int idxold = *readstate;
+ int idxnew1 = -1;
+ int idxnew2 = -1;
+ int idxold1 = *readstate;
+ int idxold2 = idxold1;
int statesnew = ~*readstate & newstate;
int statesold = *readstate & ~newstate;
- WARN_ON_ONCE(idxold < 0);
- WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1);
+ WARN_ON_ONCE(idxold2 < 0);
+ WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
if (statesnew & RCUTORTURE_RDR_BH)
local_bh_disable();
+ if (statesnew & RCUTORTURE_RDR_RBH)
+ rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_IRQ)
local_irq_disable();
if (statesnew & RCUTORTURE_RDR_PREEMPT)
preempt_disable();
- if (statesnew & RCUTORTURE_RDR_RBH)
- rcu_read_lock_bh();
if (statesnew & RCUTORTURE_RDR_SCHED)
rcu_read_lock_sched();
- if (statesnew & RCUTORTURE_RDR_RCU)
- idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT;
+ if (statesnew & RCUTORTURE_RDR_RCU_1)
+ idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1;
+ if (statesnew & RCUTORTURE_RDR_RCU_2)
+ idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2;
- /* Next, remove old protection, irq first due to bh conflict. */
+ /*
+ * Next, remove old protection, in decreasing order of strength
+ * to avoid unlock paths that aren't safe in the stronger
+	 * context. Namely: BH cannot be enabled with disabled interrupts.
+ * Additionally PREEMPT_RT requires that BH is enabled in preemptible
+ * context.
+ */
if (statesold & RCUTORTURE_RDR_IRQ)
local_irq_enable();
- if (statesold & RCUTORTURE_RDR_BH)
- local_bh_enable();
if (statesold & RCUTORTURE_RDR_PREEMPT)
preempt_enable();
- if (statesold & RCUTORTURE_RDR_RBH)
- rcu_read_unlock_bh();
if (statesold & RCUTORTURE_RDR_SCHED)
rcu_read_unlock_sched();
- if (statesold & RCUTORTURE_RDR_RCU) {
- bool lockit = !statesnew && !(torture_random(trsp) & 0xffff);
+ if (statesold & RCUTORTURE_RDR_BH)
+ local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_RBH)
+ rcu_read_unlock_bh();
+ if (statesold & RCUTORTURE_RDR_RCU_2) {
+ cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1);
+ WARN_ON_ONCE(idxnew2 != -1);
+ idxold2 = 0;
+ }
+ if (statesold & RCUTORTURE_RDR_RCU_1) {
+ bool lockit;
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
if (lockit)
raw_spin_lock_irqsave(&current->pi_lock, flags);
- cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
+ cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
if (lockit)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
@@ -1290,13 +1847,19 @@ static void rcutorture_one_extend(int *readstate, int newstate,
cur_ops->read_delay(trsp, rtrsp);
/* Update the reader state. */
- if (idxnew == -1)
- idxnew = idxold & ~RCUTORTURE_RDR_MASK;
- WARN_ON_ONCE(idxnew < 0);
- WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1);
- *readstate = idxnew | newstate;
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0);
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1);
+ if (idxnew1 == -1)
+ idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
+ WARN_ON_ONCE(idxnew1 < 0);
+ if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1))
+ pr_info("Unexpected idxnew1 value of %#x\n", idxnew1);
+ if (idxnew2 == -1)
+ idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
+ WARN_ON_ONCE(idxnew2 < 0);
+ WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1);
+ *readstate = idxnew1 | idxnew2 | newstate;
+ WARN_ON_ONCE(*readstate < 0);
+ if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1))
+ pr_info("Unexpected idxnew2 value of %#x\n", idxnew2);
}
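
With the change above, *readstate carries two independent reader-lock indices plus the protection-state bits, one index per level of cur_ops->readlock() nesting. A minimal sketch of the packing follows; the shift values 8 and 9 are assumptions for illustration only, the real RCUTORTURE_RDR_SHIFT_1/_2 are defined earlier in rcutorture.c:

	#define EX_RDR_SHIFT_1	8	/* Assumed value, for illustration only. */
	#define EX_RDR_SHIFT_2	9	/* Assumed value, for illustration only. */

	static int example_pack(int idx1, int idx2, int statebits)
	{
		return ((idx1 & 0x1) << EX_RDR_SHIFT_1) |
		       ((idx2 & 0x1) << EX_RDR_SHIFT_2) | statebits;
	}

	static int example_unpack_idx1(int readstate)
	{
		return (readstate >> EX_RDR_SHIFT_1) & 0x1;
	}
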
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -1306,7 +1869,7 @@ static int rcutorture_extend_mask_max(void)
WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND);
mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables;
- mask = mask | RCUTORTURE_RDR_RCU;
+ mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
return mask;
}
@@ -1315,21 +1878,47 @@ static int
rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
{
int mask = rcutorture_extend_mask_max();
- unsigned long randmask1 = torture_random(trsp) >> 8;
+ unsigned long randmask1 = torture_random(trsp);
unsigned long randmask2 = randmask1 >> 3;
+ unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
+ unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
+ WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1);
/* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
else
mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS));
- /* Can't enable bh w/irq disabled. */
- if ((mask & RCUTORTURE_RDR_IRQ) &&
- ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
- (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
- mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- return mask ?: RCUTORTURE_RDR_RCU;
+
+ // Can't have nested RCU reader without outer RCU reader.
+ if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) {
+ if (oldmask & RCUTORTURE_RDR_RCU_1)
+ mask &= ~RCUTORTURE_RDR_RCU_2;
+ else
+ mask |= RCUTORTURE_RDR_RCU_1;
+ }
+
+ /*
+ * Can't enable bh w/irq disabled.
+ */
+ if (mask & RCUTORTURE_RDR_IRQ)
+ mask |= oldmask & bhs;
+
+ /*
+ * Ideally these sequences would be detected in debug builds
+ * (regardless of RT), but until then don't stop testing
+ * them on non-RT.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* Can't modify BH in atomic context */
+ if (oldmask & preempts_irq)
+ mask &= ~bhs;
+ if ((oldmask | mask) & preempts_irq)
+ mask |= oldmask & bhs;
+ }
+
+ return mask ?: RCUTORTURE_RDR_RCU_1;
}
/*
@@ -1348,7 +1937,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
if (!((mask - 1) & mask))
return rtrsp; /* Current RCU reader not extendable. */
/* Bias towards larger numbers of loops. */
- i = (torture_random(trsp) >> 3);
+ i = torture_random(trsp);
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
@@ -1362,8 +1951,11 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
* no data to read. Can be invoked both from process context and
* from a timer handler.
*/
-static bool rcu_torture_one_read(struct torture_random_state *trsp)
+static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
{
+ bool checkpolling = !(torture_random(trsp) & 0xfff);
+ unsigned long cookie;
+ struct rcu_gp_oldstate cookie_full;
int i;
unsigned long started;
unsigned long completed;
@@ -1379,14 +1971,16 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ cookie = cur_ops->get_gp_state();
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ cur_ops->get_gp_state_full(&cookie_full);
+ }
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
p = rcu_dereference_check(rcu_torture_current,
- rcu_read_lock_bh_held() ||
- rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp) ||
- rcu_read_lock_trace_held() ||
- torturing_tasks());
+ !cur_ops->readlock_held || cur_ops->readlock_held());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
@@ -1394,6 +1988,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
}
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
+ rcu_torture_reader_do_mbchk(myid, p, trsp);
rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp);
preempt_disable();
pipe_count = READ_ONCE(p->rtort_pipe_count);
@@ -1415,8 +2010,24 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
}
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
+ if (checkpolling) {
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+ WARN_ONCE(cur_ops->poll_gp_state(cookie),
+ "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cookie, cur_ops->get_gp_state());
+ if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+ WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+ "%s: Cookie check 6 failed %s(%d) online %*pbl\n",
+ __func__,
+ rcu_torture_writer_state_getname(),
+ rcu_torture_writer_state,
+ cpumask_pr_args(cpu_online_mask));
+ }
rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
- WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK);
+ WARN_ON_ONCE(readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1);
@@ -1443,7 +2054,7 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand);
static void rcu_torture_timer(struct timer_list *unused)
{
atomic_long_inc(&n_rcu_torture_timers);
- (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand));
+ (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1);
/* Test call_rcu() invocation from interrupt handler. */
if (cur_ops->call) {
@@ -1479,13 +2090,13 @@ rcu_torture_reader(void *arg)
if (!timer_pending(&t))
mod_timer(&t, jiffies + 1);
}
- if (!rcu_torture_one_read(&rand) && !torture_must_stop())
+ if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop())
schedule_timeout_interruptible(HZ);
if (time_after(jiffies, lastsleep) && !torture_must_stop()) {
- schedule_timeout_interruptible(1);
+ torture_hrtimeout_us(500, 1000, &rand);
lastsleep = jiffies + 10;
}
- while (num_online_cpus() < mynumonline && !torture_must_stop())
+ while (torture_num_online_cpus() < mynumonline && !torture_must_stop())
schedule_timeout_interruptible(HZ / 5);
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
@@ -1499,6 +2110,53 @@ rcu_torture_reader(void *arg)
}
/*
+ * Randomly toggle CPUs' callback-offload state. This uses hrtimers to
+ * increase race probabilities and fuzzes the interval between toggling.
+ */
+static int rcu_nocb_toggle(void *arg)
+{
+ int cpu;
+ int maxcpu = -1;
+ int oldnice = task_nice(current);
+ long r;
+ DEFINE_TORTURE_RANDOM(rand);
+ ktime_t toggle_delay;
+ unsigned long toggle_fuzz;
+ ktime_t toggle_interval = ms_to_ktime(nocbs_toggle);
+
+ VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started");
+ while (!rcu_inkernel_boot_has_ended())
+ schedule_timeout_interruptible(HZ / 10);
+ for_each_possible_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0);
+ if (toggle_interval > ULONG_MAX)
+ toggle_fuzz = ULONG_MAX >> 3;
+ else
+ toggle_fuzz = toggle_interval >> 3;
+ if (toggle_fuzz <= 0)
+ toggle_fuzz = NSEC_PER_USEC;
+ do {
+ r = torture_random(&rand);
+ cpu = (r >> 1) % (maxcpu + 1);
+ if (r & 0x1) {
+ rcu_nocb_cpu_offload(cpu);
+ atomic_long_inc(&n_nocb_offload);
+ } else {
+ rcu_nocb_cpu_deoffload(cpu);
+ atomic_long_inc(&n_nocb_deoffload);
+ }
+ toggle_delay = torture_random(&rand) % toggle_fuzz + toggle_interval;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_hrtimeout(&toggle_delay, HRTIMER_MODE_REL);
+ if (stutter_wait("rcu_nocb_toggle"))
+ sched_set_normal(current, oldnice);
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_nocb_toggle");
+ return 0;
+}
+
+/*
* Print torture statistics. Caller must ensure that there is only
* one call to this function at a given time!!! This is normally
* accomplished by relying on the module system to only have one copy
@@ -1524,7 +2182,7 @@ rcu_torture_stats_print(void)
batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
}
- for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+ for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
}
@@ -1539,11 +2197,11 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ",
atomic_read(&n_rcu_torture_mberror),
+ atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries),
n_rcu_torture_barrier_error,
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror);
+ n_rcu_torture_boost_ktrerror);
pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
@@ -1553,20 +2211,22 @@ rcu_torture_stats_print(void)
data_race(n_barrier_successes),
data_race(n_barrier_attempts),
data_race(n_rcu_torture_barrier_error));
- pr_cont("read-exits: %ld\n", data_race(n_read_exits));
+ pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic.
+ pr_cont("nocb-toggles: %ld:%ld\n",
+ atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload));
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) ||
+ atomic_read(&n_rcu_torture_mbchk_fail) ||
n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
- n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
- i > 1) {
+ n_rcu_torture_boost_failure || i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
+ WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail));
WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
- WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
- WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed
+ WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?)
WARN_ON_ONCE(i > 1); // Too-short grace period
}
pr_cont("Reader Pipe: ");
@@ -1600,10 +2260,10 @@ rcu_torture_stats_print(void)
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
&flags, &gp_seq);
wtp = READ_ONCE(writer_task);
- pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state, gp_seq, flags,
- wtp == NULL ? ~0UL : wtp->state,
+ wtp == NULL ? ~0U : wtp->__state,
wtp == NULL ? -1 : (int)task_cpu(wtp));
if (!splatted && wtp) {
sched_show_task(wtp);
@@ -1633,6 +2293,56 @@ rcu_torture_stats(void *arg)
return 0;
}
+/* Test mem_dump_obj() and friends. */
+static void rcu_torture_mem_dump_obj(void)
+{
+ struct rcu_head *rhp;
+ struct kmem_cache *kcp;
+ static int z;
+
+ kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL);
+ if (WARN_ON_ONCE(!kcp))
+ return;
+ rhp = kmem_cache_alloc(kcp, GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp)) {
+ kmem_cache_destroy(kcp);
+ return;
+ }
+ pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z);
+ pr_alert("mem_dump_obj(ZERO_SIZE_PTR):");
+ mem_dump_obj(ZERO_SIZE_PTR);
+ pr_alert("mem_dump_obj(NULL):");
+ mem_dump_obj(NULL);
+ pr_alert("mem_dump_obj(%px):", &rhp);
+ mem_dump_obj(&rhp);
+ pr_alert("mem_dump_obj(%px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(%px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ pr_alert("mem_dump_obj(%px):", &z);
+ mem_dump_obj(&z);
+ kmem_cache_free(kcp, rhp);
+ kmem_cache_destroy(kcp);
+ rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+ if (WARN_ON_ONCE(!rhp))
+ return;
+ pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ kfree(rhp);
+ rhp = vmalloc(4096);
+ if (WARN_ON_ONCE(!rhp))
+ return;
+ pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", rhp);
+ mem_dump_obj(rhp);
+ pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func);
+ mem_dump_obj(&rhp->func);
+ vfree(rhp);
+}
+
static void
rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
{
@@ -1647,7 +2357,9 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"stall_cpu_block=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
- "read_exit_delay=%d read_exit_burst=%d\n",
+ "read_exit_delay=%d read_exit_burst=%d "
+ "nocbs_nthreads=%d nocbs_toggle=%d "
+ "test_nmis=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1657,7 +2369,9 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stall_cpu_block,
n_barrier_cbs,
onoff_interval, onoff_holdoff,
- read_exit_delay, read_exit_burst);
+ read_exit_delay, read_exit_burst,
+ nocbs_nthreads, nocbs_toggle,
+ test_nmis);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -1684,13 +2398,25 @@ static int rcutorture_booster_init(unsigned int cpu)
if (boost_tasks[cpu] != NULL)
return 0; /* Already created, nothing more to do. */
+	// Testing RCU priority boosting requires rcutorture to do
+ // some serious abuse. Counter this by running ksoftirqd
+ // at higher priority.
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) {
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(ksoftirqd, cpu);
+ WARN_ON_ONCE(!t);
+ sp.sched_priority = 2;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
rcu_torture_disable_rt_throttle();
VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
- boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
- cpu_to_node(cpu),
- "rcu_torture_boost");
+ boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL,
+ cpu, "rcu_torture_boost_%u");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
@@ -1699,12 +2425,20 @@ static int rcutorture_booster_init(unsigned int cpu)
mutex_unlock(&boost_mutex);
return retval;
}
- kthread_bind(boost_tasks[cpu], cpu);
- wake_up_process(boost_tasks[cpu]);
mutex_unlock(&boost_mutex);
return 0;
}
+static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr)
+{
+ pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_torture_stall_block = {
+ .notifier_call = rcu_torture_stall_nf,
+};
+
/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
* induces a CPU stall for the time specified by stall_cpu.
@@ -1712,9 +2446,16 @@ static int rcutorture_booster_init(unsigned int cpu)
static int rcu_torture_stall(void *args)
{
int idx;
+ int ret;
unsigned long stop_at;
VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
+ if (rcu_cpu_stall_notifiers) {
+ ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n",
+ __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : "");
+ }
if (stall_cpu_holdoff > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
@@ -1738,19 +2479,31 @@ static int rcu_torture_stall(void *args)
local_irq_disable();
else if (!stall_cpu_block)
preempt_disable();
- pr_alert("rcu_torture_stall start on CPU %d.\n",
- raw_smp_processor_id());
+ pr_alert("%s start on CPU %d.\n",
+ __func__, raw_smp_processor_id());
while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(),
stop_at))
- if (stall_cpu_block)
+ if (stall_cpu_block) {
+#ifdef CONFIG_PREEMPTION
+ preempt_schedule();
+#else
schedule_timeout_uninterruptible(HZ);
+#endif
+ } else if (stall_no_softlockup) {
+ touch_softlockup_watchdog();
+ }
if (stall_cpu_irqsoff)
local_irq_enable();
else if (!stall_cpu_block)
preempt_enable();
cur_ops->readunlock(idx);
}
- pr_alert("rcu_torture_stall end.\n");
+ pr_alert("%s end.\n", __func__);
+ if (rcu_cpu_stall_notifiers && !ret) {
+ ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block);
+ if (ret)
+ pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret);
+ }
torture_shutdown_absorb("rcu_torture_stall");
while (!kthread_should_stop())
schedule_timeout_interruptible(10 * HZ);
@@ -1814,10 +2567,13 @@ struct rcu_fwd {
unsigned long rcu_fwd_startat;
struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
unsigned long rcu_launder_gp_seq_start;
+ int rcu_fwd_id;
};
static DEFINE_MUTEX(rcu_fwd_mutex);
static struct rcu_fwd *rcu_fwds;
+static unsigned long rcu_fwd_seq;
+static atomic_long_t rcu_fwd_max_cbs;
static bool rcu_fwd_emergency_stop;
static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
@@ -1830,8 +2586,8 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
if (rfp->n_launders_hist[i].n_launders > 0)
break;
- pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
- __func__, jiffies - rfp->rcu_fwd_startat);
+ pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
+ __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
gps_old = rfp->rcu_launder_gp_seq_start;
for (j = 0; j <= i; j++) {
gps = rfp->n_launders_hist[j].launder_gp_seq;
@@ -1929,6 +2685,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
unsigned long stopat;
static DEFINE_TORTURE_RANDOM(trs);
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
if (!cur_ops->sync)
return; // Cannot do need_resched() forward progress testing without ->sync.
if (cur_ops->call && cur_ops->cb_barrier) {
@@ -1937,7 +2694,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
}
/* Tight loop containing cond_resched(). */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 0);
@@ -1967,11 +2724,13 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
WARN_ON(!cver && gps < 2);
- pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps);
+ pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__,
+ rfp->rcu_fwd_id, dur, cver, gps);
}
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 1);
cur_ops->sync(); /* Wait for running CB to complete. */
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for queued callbacks. */
}
@@ -1980,7 +2739,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
destroy_rcu_head_on_stack(&fcs.rh);
}
schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
/* Carry out call_rcu() forward-progress testing. */
@@ -2000,13 +2759,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
unsigned long stopat;
unsigned long stoppedat;
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
if (READ_ONCE(rcu_fwd_emergency_stop))
return; /* Get out of the way quickly, no GP wait! */
if (!cur_ops->call)
return; /* Can't do call_rcu() fwd prog without ->call. */
/* Loop continuously posting RCU callbacks. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
@@ -2035,7 +2795,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
rfp->rcu_fwd_cb_head = rfcpn;
n_launders++;
n_launders_sa++;
- } else {
+ } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) {
rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL);
if (WARN_ON_ONCE(!rfcp)) {
schedule_timeout_interruptible(1);
@@ -2045,8 +2805,11 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_sa = 0;
rfcp->rfc_gps = 0;
rfcp->rfc_rfp = rfp;
+ } else {
+ rfcp = NULL;
}
- cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
+ if (rfcp)
+ cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
if (tick_nohz_full_enabled()) {
local_irq_save(flags);
@@ -2058,6 +2821,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
(void)rcu_torture_fwd_prog_cbfree(rfp);
@@ -2070,11 +2834,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders + n_max_cbs - n_launders_cb_snap,
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
+ atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
+ mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
rcu_torture_fwd_cb_hist(rfp);
+ mutex_unlock(&rcu_fwd_mutex);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
@@ -2085,6 +2852,8 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
static int rcutorture_oom_notify(struct notifier_block *self,
unsigned long notused, void *nfreed)
{
+ int i;
+ long ncbs;
struct rcu_fwd *rfp;
mutex_lock(&rcu_fwd_mutex);
@@ -2095,18 +2864,26 @@ static int rcutorture_oom_notify(struct notifier_block *self,
}
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
- rcu_torture_fwd_cb_hist(rfp);
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2);
+ for (i = 0; i < fwd_progress; i++) {
+ rcu_torture_fwd_cb_hist(&rfp[i]);
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2);
+ }
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
- rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
- rcu_barrier();
- pr_info("%s: Freed %lu RCU callbacks.\n",
- __func__, rcu_torture_fwd_prog_cbfree(rfp));
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
+ cur_ops->cb_barrier();
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
+ cur_ops->cb_barrier();
+ ncbs = 0;
+ for (i = 0; i < fwd_progress; i++)
+ ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
+ pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
smp_mb(); /* Frees before return to avoid redoing OOM. */
(*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */
pr_info("%s returning after OOM processing.\n", __func__);
@@ -2121,7 +2898,10 @@ static struct notifier_block rcutorture_oom_nb = {
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
+ bool firsttime = true;
+ long max_cbs;
int oldnice = task_nice(current);
+ unsigned long oldseq = READ_ONCE(rcu_fwd_seq);
struct rcu_fwd *rfp = args;
int tested = 0;
int tested_tries = 0;
@@ -2131,21 +2911,38 @@ static int rcu_torture_fwd_prog(void *args)
if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST))
set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
- WRITE_ONCE(rcu_fwd_emergency_stop, false);
- if (!IS_ENABLED(CONFIG_TINY_RCU) ||
- rcu_inkernel_boot_has_ended())
- rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
- if (rcu_inkernel_boot_has_ended())
+ if (!rfp->rcu_fwd_id) {
+ schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
+ WRITE_ONCE(rcu_fwd_emergency_stop, false);
+ if (!firsttime) {
+ max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0);
+ pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs);
+ }
+ firsttime = false;
+ WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
+ } else {
+ while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
+ schedule_timeout_interruptible(HZ / 20);
+ oldseq = READ_ONCE(rcu_fwd_seq);
+ }
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
+ if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)
rcu_torture_fwd_prog_cr(rfp);
+ if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) &&
+ (!IS_ENABLED(CONFIG_TINY_RCU) ||
+ (rcu_inkernel_boot_has_ended() &&
+ torture_num_online_cpus() > rfp->rcu_fwd_id)))
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
/* Avoid slow periods, better to test when busy. */
if (stutter_wait("rcu_torture_fwd_prog"))
sched_set_normal(current, oldnice);
} while (!torture_must_stop());
/* Short runs might not contain a valid forward-progress attempt. */
- WARN_ON(!tested && tested_tries >= 5);
- pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ if (!rfp->rcu_fwd_id) {
+ WARN_ON(!tested && tested_tries >= 5);
+ pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries);
+ }
torture_kthread_stopping("rcu_torture_fwd_prog");
return 0;
}
@@ -2153,18 +2950,29 @@ static int rcu_torture_fwd_prog(void *args)
/* If forward-progress checking is requested and feasible, spawn the thread. */
static int __init rcu_torture_fwd_prog_init(void)
{
+ int i;
+ int ret = 0;
struct rcu_fwd *rfp;
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
+ if (fwd_progress >= nr_cpu_ids) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n");
+ fwd_progress = nr_cpu_ids;
+ } else if (fwd_progress < 0) {
+ fwd_progress = nr_cpu_ids;
+ }
if ((!cur_ops->sync && !cur_ops->call) ||
- !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) {
+ (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) ||
+ cur_ops == &rcu_busted_ops) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
+ fwd_progress = 0;
return 0;
}
if (stall_cpu > 0) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing");
- if (IS_MODULE(CONFIG_RCU_TORTURE_TESTS))
+ fwd_progress = 0;
+ if (IS_MODULE(CONFIG_RCU_TORTURE_TEST))
return -EINVAL; /* In module, can fail back to user. */
WARN_ON(1); /* Make sure rcutorture notices conflict. */
return 0;
@@ -2173,29 +2981,51 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress_holdoff = 1;
if (fwd_progress_div <= 0)
fwd_progress_div = 4;
- rfp = kzalloc(sizeof(*rfp), GFP_KERNEL);
- if (!rfp)
+ rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL);
+ fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL);
+ if (!rfp || !fwd_prog_tasks) {
+ kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
+ fwd_progress = 0;
return -ENOMEM;
- spin_lock_init(&rfp->rcu_fwd_lock);
- rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head;
+ }
+ for (i = 0; i < fwd_progress; i++) {
+ spin_lock_init(&rfp[i].rcu_fwd_lock);
+ rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head;
+ rfp[i].rcu_fwd_id = i;
+ }
mutex_lock(&rcu_fwd_mutex);
rcu_fwds = rfp;
mutex_unlock(&rcu_fwd_mutex);
register_oom_notifier(&rcutorture_oom_nb);
- return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task);
+ for (i = 0; i < fwd_progress; i++) {
+ ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]);
+ if (ret) {
+ fwd_progress = i;
+ return ret;
+ }
+ }
+ return 0;
}
static void rcu_torture_fwd_prog_cleanup(void)
{
+ int i;
struct rcu_fwd *rfp;
- torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
- rfp = rcu_fwds;
+ if (!rcu_fwds || !fwd_prog_tasks)
+ return;
+ for (i = 0; i < fwd_progress; i++)
+ torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]);
+ unregister_oom_notifier(&rcutorture_oom_nb);
mutex_lock(&rcu_fwd_mutex);
+ rfp = rcu_fwds;
rcu_fwds = NULL;
mutex_unlock(&rcu_fwd_mutex);
- unregister_oom_notifier(&rcutorture_oom_nb);
kfree(rfp);
+ kfree(fwd_prog_tasks);
+ fwd_prog_tasks = NULL;
}
/* Callback function for RCU barrier testing. */
@@ -2362,13 +3192,15 @@ static bool rcu_torture_can_boost(void)
if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2)
return false;
+ if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)
+ return false;
prio = rcu_get_gp_kthreads_prio();
if (!prio)
return false;
if (prio < 2) {
- if (boost_warn_once == 1)
+ if (boost_warn_once == 1)
return false;
pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME);
@@ -2391,15 +3223,14 @@ static int rcu_torture_read_exit_child(void *trsp_in)
set_user_nice(current, MAX_NICE);
// Minimize time between reading and exiting.
while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
- (void)rcu_torture_one_read(trsp);
+ schedule_timeout_uninterruptible(HZ / 20);
+ (void)rcu_torture_one_read(trsp, -1);
return 0;
}
// Parent kthread which creates and destroys read-exit child kthreads.
static int rcu_torture_read_exit(void *unused)
{
- int count = 0;
bool errexit = false;
int i;
struct task_struct *tsp;
@@ -2411,34 +3242,28 @@ static int rcu_torture_read_exit(void *unused)
// Each pass through this loop does one read-exit episode.
do {
- if (++count > read_exit_burst) {
- VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
- rcu_barrier(); // Wait for task_struct free, avoid OOM.
- for (i = 0; i < read_exit_delay; i++) {
- schedule_timeout_uninterruptible(HZ);
- if (READ_ONCE(read_exit_child_stop))
- break;
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+ for (i = 0; i < read_exit_burst; i++) {
+ if (READ_ONCE(read_exit_child_stop))
+ break;
+ stutter_wait("rcu_torture_read_exit");
+ // Spawn child.
+ tsp = kthread_run(rcu_torture_read_exit_child,
+ &trs, "%s", "rcu_torture_read_exit_child");
+ if (IS_ERR(tsp)) {
+ TOROUT_ERRSTRING("out of memory");
+ errexit = true;
+ break;
}
- if (!READ_ONCE(read_exit_child_stop))
- VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
- count = 0;
- }
- if (READ_ONCE(read_exit_child_stop))
- break;
- // Spawn child.
- tsp = kthread_run(rcu_torture_read_exit_child,
- &trs, "%s",
- "rcu_torture_read_exit_child");
- if (IS_ERR(tsp)) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
- errexit = true;
- tsp = NULL;
- break;
+ cond_resched();
+ kthread_stop(tsp);
+ n_read_exits++;
}
- cond_resched();
- kthread_stop(tsp);
- n_read_exits ++;
- stutter_wait("rcu_torture_read_exit");
+ VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+ rcu_barrier(); // Wait for task_struct free, avoid OOM.
+ i = 0;
+ for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++)
+ schedule_timeout_uninterruptible(HZ);
} while (!errexit && !READ_ONCE(read_exit_child_stop));
// Clean up and exit.
@@ -2446,7 +3271,7 @@ static int rcu_torture_read_exit(void *unused)
smp_mb(); // Store before wakeup.
wake_up(&read_exit_wq);
while (!torture_must_stop())
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_uninterruptible(HZ / 20);
torture_kthread_stopping("rcu_torture_read_exit");
return 0;
}
@@ -2454,7 +3279,7 @@ static int rcu_torture_read_exit(void *unused)
static int rcu_torture_read_exit_init(void)
{
if (read_exit_burst <= 0)
- return -EINVAL;
+ return 0;
init_waitqueue_head(&read_exit_wq);
read_exit_child_stop = false;
read_exit_child_stopped = false;
@@ -2472,6 +3297,29 @@ static void rcu_torture_read_exit_cleanup(void)
torture_stop_kthread(rcutorture_read_exit, read_exit_task);
}
+static void rcutorture_test_nmis(int n)
+{
+#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ int cpu;
+ int dumpcpu;
+ int i;
+
+ for (i = 0; i < n; i++) {
+ preempt_disable();
+ cpu = smp_processor_id();
+ dumpcpu = cpu + 1;
+ if (dumpcpu >= nr_cpu_ids)
+ dumpcpu = 0;
+ pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu);
+ dump_cpu_task(dumpcpu);
+ preempt_enable();
+ schedule_timeout_uninterruptible(15 * HZ);
+ }
+#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis);
+#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+}
+
static enum cpuhp_state rcutor_hp;
static void
@@ -2483,15 +3331,21 @@ rcu_torture_cleanup(void)
int i;
if (torture_cleanup_begin()) {
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
+ rcu_gp_slow_unregister(NULL);
return;
}
if (!cur_ops) {
torture_cleanup_end();
+ rcu_gp_slow_unregister(NULL);
return;
}
+ rcutorture_test_nmis(test_nmis);
+
if (cur_ops->gp_kthread_dbg)
cur_ops->gp_kthread_dbg();
rcu_torture_read_exit_cleanup();
@@ -2500,6 +3354,13 @@ rcu_torture_cleanup(void)
torture_stop_kthread(rcu_torture_stall, stall_task);
torture_stop_kthread(rcu_torture_writer, writer_task);
+ if (nocb_tasks) {
+ for (i = 0; i < nrealnocbers; i++)
+ torture_stop_kthread(rcu_nocb_toggle, nocb_tasks[i]);
+ kfree(nocb_tasks);
+ nocb_tasks = NULL;
+ }
+
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
torture_stop_kthread(rcu_torture_reader,
@@ -2507,6 +3368,8 @@ rcu_torture_cleanup(void)
kfree(reader_tasks);
reader_tasks = NULL;
}
+ kfree(rcu_torture_reader_mbchk);
+ rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
@@ -2523,18 +3386,22 @@ rcu_torture_cleanup(void)
rcutorture_seq_diff(gp_seq, start_gp_seq));
torture_stop_kthread(rcu_torture_stats, stats_task);
torture_stop_kthread(rcu_torture_fqs, fqs_task);
- if (rcu_torture_can_boost())
+ if (rcu_torture_can_boost() && rcutor_hp >= 0)
cpuhp_remove_state(rcutor_hp);
/*
* Wait for all RCU callbacks to fire, then do torture-type-specific
* cleanup operations.
*/
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
if (cur_ops->cleanup != NULL)
cur_ops->cleanup();
+ rcu_torture_mem_dump_obj();
+
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
if (err_segs_recorded) {
@@ -2572,6 +3439,7 @@ rcu_torture_cleanup(void)
else
rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
torture_cleanup_end();
+ rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay);
}
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -2604,6 +3472,7 @@ static void rcu_test_debug_objects(void)
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
struct rcu_head rh1;
struct rcu_head rh2;
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
@@ -2612,10 +3481,14 @@ static void rcu_test_debug_objects(void)
/* Try to queue the rh2 pair of callbacks for the same grace period. */
preempt_disable(); /* Prevent preemption from interrupting test. */
rcu_read_lock(); /* Make it impossible to finish a grace period. */
- call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */
local_irq_disable(); /* Make it harder to start a new grace period. */
- call_rcu(&rh2, rcu_torture_leak_cb);
- call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ call_rcu_hurry(&rh2, rcu_torture_leak_cb);
+ call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ if (rhp) {
+ call_rcu_hurry(rhp, rcu_torture_leak_cb);
+ call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ }
local_irq_enable();
rcu_read_unlock();
preempt_enable();
@@ -2625,6 +3498,7 @@ static void rcu_test_debug_objects(void)
pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
+ kfree(rhp);
#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
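
The switch from call_rcu() to call_rcu_hurry() above requests non-lazy callbacks, so the duplicate-callback debug-objects test is not delayed by callback laziness. For reference, a minimal caller of call_rcu_hurry() might look like the following (example_obj and its helpers are hypothetical, not part of the patch):

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct example_obj {
		struct rcu_head rh;
		int payload;
	};

	static void example_free_cb(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct example_obj, rh));
	}

	static void example_defer_free(struct example_obj *p)
	{
		call_rcu_hurry(&p->rh, example_free_cb);	/* Non-lazy callback. */
	}
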
@@ -2638,6 +3512,188 @@ static void rcutorture_sync(void)
cur_ops->sync();
}
+static DEFINE_MUTEX(mut0);
+static DEFINE_MUTEX(mut1);
+static DEFINE_MUTEX(mut2);
+static DEFINE_MUTEX(mut3);
+static DEFINE_MUTEX(mut4);
+static DEFINE_MUTEX(mut5);
+static DEFINE_MUTEX(mut6);
+static DEFINE_MUTEX(mut7);
+static DEFINE_MUTEX(mut8);
+static DEFINE_MUTEX(mut9);
+
+static DECLARE_RWSEM(rwsem0);
+static DECLARE_RWSEM(rwsem1);
+static DECLARE_RWSEM(rwsem2);
+static DECLARE_RWSEM(rwsem3);
+static DECLARE_RWSEM(rwsem4);
+static DECLARE_RWSEM(rwsem5);
+static DECLARE_RWSEM(rwsem6);
+static DECLARE_RWSEM(rwsem7);
+static DECLARE_RWSEM(rwsem8);
+static DECLARE_RWSEM(rwsem9);
+
+DEFINE_STATIC_SRCU(srcu0);
+DEFINE_STATIC_SRCU(srcu1);
+DEFINE_STATIC_SRCU(srcu2);
+DEFINE_STATIC_SRCU(srcu3);
+DEFINE_STATIC_SRCU(srcu4);
+DEFINE_STATIC_SRCU(srcu5);
+DEFINE_STATIC_SRCU(srcu6);
+DEFINE_STATIC_SRCU(srcu7);
+DEFINE_STATIC_SRCU(srcu8);
+DEFINE_STATIC_SRCU(srcu9);
+
+static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i,
+ int cyclelen, int deadlock)
+{
+ int j = i + 1;
+
+ if (j >= cyclelen)
+ j = deadlock ? 0 : -1;
+ if (j >= 0)
+ pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i);
+ else
+ pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i);
+ return j;
+}
+
+// Test lockdep on SRCU-based deadlock scenarios.
+static void rcu_torture_init_srcu_lockdep(void)
+{
+ int cyclelen;
+ int deadlock;
+ bool err = false;
+ int i;
+ int j;
+ int idx;
+ struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4,
+ &mut5, &mut6, &mut7, &mut8, &mut9 };
+ struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4,
+ &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 };
+ struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4,
+ &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 };
+ int testtype;
+
+ if (!test_srcu_lockdep)
+ return;
+
+ deadlock = test_srcu_lockdep / 1000;
+ testtype = (test_srcu_lockdep / 10) % 100;
+ cyclelen = test_srcu_lockdep % 10;
+ WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus));
+ if (WARN_ONCE(deadlock != !!deadlock,
+ "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n",
+ __func__, test_srcu_lockdep, deadlock))
+ err = true;
+ if (WARN_ONCE(cyclelen <= 0,
+ "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n",
+ __func__, test_srcu_lockdep, cyclelen))
+ err = true;
+ if (err)
+ goto err_out;
+
+ if (testtype == 0) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu",
+ "srcu_read_unlock", i, cyclelen, deadlock);
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+
+ if (testtype == 1) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ mutex_lock(muts[i]);
+ mutex_unlock(muts[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu",
+ "mutex_unlock", i, cyclelen, deadlock);
+ mutex_lock(muts[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ mutex_unlock(muts[i]);
+ }
+ return;
+ }
+
+ if (testtype == 2) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ down_read(rwsems[i]);
+ up_read(rwsems[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu",
+ "up_write", i, cyclelen, deadlock);
+ down_write(rwsems[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ up_write(rwsems[i]);
+ }
+ return;
+ }
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (testtype == 3) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock";
+ char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace"
+ : "synchronize_srcu";
+ char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock";
+
+ j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock);
+ if (i == 0)
+ rcu_read_lock_trace();
+ else
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0) {
+ if (i == cyclelen - 1)
+ synchronize_rcu_tasks_trace();
+ else
+ synchronize_srcu(srcus[j]);
+ }
+ if (i == 0)
+ rcu_read_unlock_trace();
+ else
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+#endif // #ifdef CONFIG_TASKS_TRACE_RCU
+
+err_out:
+ pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep);
+ pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__);
+ pr_info("%s: D: Deadlock if nonzero.\n", __func__);
+ pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__);
+ pr_info("%s: L: Cycle length.\n", __func__);
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU))
+ pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__);
+}
+
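
For reference, the test_srcu_lockdep module parameter decodes as DNNL per the divisions above; for example, rcutorture.test_srcu_lockdep=1003 requests a deadlocking (D=1) pure-SRCU (NN=00) cycle of length 3. A tiny sketch of the decoding (example_decode() is hypothetical):

	#include <linux/printk.h>

	static void example_decode(int v)
	{
		int deadlock = v / 1000;	/* D digit. */
		int testtype = (v / 10) % 100;	/* NN digits. */
		int cyclelen = v % 10;		/* L digit. */

		pr_info("D=%d NN=%02d L=%d\n", deadlock, testtype, cyclelen);
	}
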
static int __init
rcu_torture_init(void)
{
@@ -2647,9 +3703,9 @@ rcu_torture_init(void)
int flags = 0;
unsigned long gp_seq = 0;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &busted_srcud_ops, &tasks_ops, &tasks_rude_ops,
- &tasks_tracing_ops, &trivial_ops,
+ &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops,
+ TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS
+ &trivial_ops,
};
if (!torture_init_begin(torture_type, verbose))
@@ -2676,9 +3732,17 @@ rcu_torture_init(void)
pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
+ if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops ||
+ !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n",
+ cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU));
+ nocbs_nthreads = 0;
+ }
if (cur_ops->init)
cur_ops->init();
+ rcu_torture_init_srcu_lockdep();
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -2710,10 +3774,11 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_alloc_fail, 0);
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
+ atomic_set(&n_rcu_torture_mbchk_fail, 0);
+ atomic_set(&n_rcu_torture_mbchk_tries, 0);
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
- n_rcu_torture_boost_rterror = 0;
n_rcu_torture_boost_failure = 0;
n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -2729,16 +3794,17 @@ rcu_torture_init(void)
/* Start up the kthreads. */
+ rcu_torture_write_types();
firsterr = torture_create_kthread(rcu_torture_writer, NULL,
writer_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
if (nfakewriters > 0) {
fakewriter_tasks = kcalloc(nfakewriters,
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@@ -2746,31 +3812,54 @@ rcu_torture_init(void)
for (i = 0; i < nfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
NULL, fakewriter_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
- if (reader_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
+ rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk),
+ GFP_KERNEL);
+ if (!reader_tasks || !rcu_torture_reader_mbchk) {
+ TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
+ rcu_torture_reader_mbchk[i].rtc_chkrdr = -1;
firsterr = torture_create_kthread(rcu_torture_reader, (void *)i,
reader_tasks[i]);
- if (firsterr)
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
+ nrealnocbers = nocbs_nthreads;
+ if (WARN_ON(nrealnocbers < 0))
+ nrealnocbers = 1;
+ if (WARN_ON(nocbs_toggle < 0))
+ nocbs_toggle = HZ;
+ if (nrealnocbers > 0) {
+ nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL);
+ if (nocb_tasks == NULL) {
+ TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ } else {
+ nocb_tasks = NULL;
+ }
+ for (i = 0; i < nrealnocbers; i++) {
+ firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]);
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(rcu_torture_stats, NULL,
stats_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (test_no_idle_hz && shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval * HZ);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (stutter < 0)
@@ -2780,16 +3869,18 @@ rcu_torture_init(void)
t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
firsterr = torture_stutter_init(stutter * HZ, t);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (fqs_duration < 0)
fqs_duration = 0;
- if (fqs_duration) {
+ if (fqs_holdoff < 0)
+ fqs_holdoff = 0;
+ if (fqs_duration && fqs_holdoff) {
/* Create the fqs thread */
firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
fqs_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
}
if (test_boost_interval < 1)
@@ -2803,33 +3894,34 @@ rcu_torture_init(void)
firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE",
rcutorture_booster_init,
rcutorture_booster_cleanup);
- if (firsterr < 0)
- goto unwind;
rcutor_hp = firsterr;
+ if (torture_init_error(firsterr))
+ goto unwind;
}
shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval,
rcutorture_sync);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_stall_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_fwd_prog_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_barrier_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
firsterr = rcu_torture_read_exit_init();
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
if (object_debug)
rcu_test_debug_objects();
torture_init_end();
+ rcu_gp_slow_register(&rcu_fwd_cb_nodelay);
return 0;
unwind:
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 23ff36a66f97..2c2648a3ad30 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -44,10 +44,24 @@
pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
#define VERBOSE_SCALEOUT(s, x...) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
+ do { \
+ if (verbose) \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
+ } while (0)
-#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
- do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
+static atomic_t verbose_batch_ctr;
+
+#define VERBOSE_SCALEOUT_BATCH(s, x...) \
+do { \
+ if (verbose && \
+ (verbose_batched <= 0 || \
+ !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \
+ schedule_timeout_uninterruptible(1); \
+ pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \
+ } \
+} while (0)
+
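
The batched variant only prints on every verbose_batched-th invocation (sleeping one jiffy first to decongest the console), so per-reader chatter no longer scales with the reader count. A hypothetical call site, assuming the macro above:

	static void example_report(long myid)
	{
		VERBOSE_SCALEOUT_BATCH("reader %ld: starting loop", myid);
	}
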
+#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
@@ -57,10 +71,13 @@ module_param(scale_type, charp, 0444);
MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock).");
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");
// Wait until there are multiple CPUs before starting test.
torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
"Holdoff time before test start (s)");
+// Number of typesafe_lookup structures, that is, the degree of concurrency.
+torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
// Number of loops per experiment, all readers execute operations concurrently.
torture_param(long, loops, 10000, "Number of loops per experiment.");
// Number of readers, with -1 defaulting to about 75% of the CPUs.
@@ -109,7 +126,7 @@ static int exp_idx;
// Operations vector for selecting different types of tests.
struct ref_scale_ops {
- void (*init)(void);
+ bool (*init)(void);
void (*cleanup)(void);
void (*readsection)(const int nloops);
void (*delaysection)(const int nloops, const int udl, const int ndl);
@@ -147,8 +164,9 @@ static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl
}
}
-static void rcu_sync_scale_init(void)
+static bool rcu_sync_scale_init(void)
{
+ return true;
}
static struct ref_scale_ops rcu_ops = {
@@ -192,6 +210,8 @@ static struct ref_scale_ops srcu_ops = {
.name = "srcu"
};
+#ifdef CONFIG_TASKS_RCU
+
// Definitions for RCU Tasks ref scale testing: Empty read markers.
// These definitions also work for RCU Rude readers.
static void rcu_tasks_ref_scale_read_section(const int nloops)
@@ -217,6 +237,16 @@ static struct ref_scale_ops rcu_tasks_ops = {
.name = "rcu-tasks"
};
+#define RCU_TASKS_OPS &rcu_tasks_ops,
+
+#else // #ifdef CONFIG_TASKS_RCU
+
+#define RCU_TASKS_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_RCU
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+
// Definitions for RCU Tasks Trace ref scale testing.
static void rcu_trace_ref_scale_read_section(const int nloops)
{
@@ -246,6 +276,14 @@ static struct ref_scale_ops rcu_trace_ops = {
.name = "rcu-trace"
};
+#define RCU_TRACE_OPS &rcu_trace_ops,
+
+#else // #ifdef CONFIG_TASKS_TRACE_RCU
+
+#define RCU_TRACE_OPS
+
+#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU
+
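Note that RCU_TASKS_OPS and RCU_TRACE_OPS expand either to "&rcu_tasks_ops," / "&rcu_trace_ops," (trailing comma included) or to nothing at all, which is what lets the scale_ops[] initializer later in this file list them unconditionally and still compile when CONFIG_TASKS_RCU or CONFIG_TASKS_TRACE_RCU is not set.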
// Definitions for reference count
static atomic_t refcnt;
@@ -280,9 +318,10 @@ static struct ref_scale_ops refcnt_ops = {
// Definitions for rwlock
static rwlock_t test_rwlock;
-static void ref_rwlock_init(void)
+static bool ref_rwlock_init(void)
{
rwlock_init(&test_rwlock);
+ return true;
}
static void ref_rwlock_section(const int nloops)
@@ -316,9 +355,10 @@ static struct ref_scale_ops rwlock_ops = {
// Definitions for rwsem
static struct rw_semaphore test_rwsem;
-static void ref_rwsem_init(void)
+static bool ref_rwsem_init(void)
{
init_rwsem(&test_rwsem);
+ return true;
}
static void ref_rwsem_section(const int nloops)
@@ -349,6 +389,408 @@ static struct ref_scale_ops rwsem_ops = {
.name = "rwsem"
};
+// Definitions for global spinlock
+static DEFINE_RAW_SPINLOCK(test_lock);
+
+static void ref_lock_section(const int nloops)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock(&test_lock);
+ raw_spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock(&test_lock);
+ un_delay(udl, ndl);
+ raw_spin_unlock(&test_lock);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops lock_ops = {
+ .readsection = ref_lock_section,
+ .delaysection = ref_lock_delay_section,
+ .name = "lock"
+};
+
+// Definitions for global irq-save spinlock
+
+static void ref_lock_irq_section(const int nloops)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock_irqsave(&test_lock, flags);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static void ref_lock_irq_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long flags;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ raw_spin_lock_irqsave(&test_lock, flags);
+ un_delay(udl, ndl);
+ raw_spin_unlock_irqrestore(&test_lock, flags);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops lock_irq_ops = {
+ .readsection = ref_lock_irq_section,
+ .delaysection = ref_lock_irq_delay_section,
+ .name = "lock-irq"
+};
+
+// Definitions for acquire-release.
+static DEFINE_PER_CPU(unsigned long, test_acqrel);
+
+static void ref_acqrel_section(const int nloops)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static void ref_acqrel_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned long x;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x = smp_load_acquire(this_cpu_ptr(&test_acqrel));
+ un_delay(udl, ndl);
+ smp_store_release(this_cpu_ptr(&test_acqrel), x + 1);
+ }
+ preempt_enable();
+}
+
+static struct ref_scale_ops acqrel_ops = {
+ .readsection = ref_acqrel_section,
+ .delaysection = ref_acqrel_delay_section,
+ .name = "acqrel"
+};
+
+static volatile u64 stopopts;
+
+static void ref_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += ktime_get_real_fast_ns();
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += ktime_get_real_fast_ns();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops clock_ops = {
+ .readsection = ref_clock_section,
+ .delaysection = ref_clock_delay_section,
+ .name = "clock"
+};
+
+static void ref_jiffies_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += jiffies;
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += jiffies;
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops jiffies_ops = {
+ .readsection = ref_jiffies_section,
+ .delaysection = ref_jiffies_delay_section,
+ .name = "jiffies"
+};
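The lock, lock-irq, acqrel, clock, and jiffies read-side primitives above are selected the same way as the existing ones, via the scale_type module parameter; for example (a hypothetical invocation), refscale.scale_type=clock together with refscale.loops and refscale.nreaders set as desired.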
+
+////////////////////////////////////////////////////////////////////////
+//
+// Methods leveraging SLAB_TYPESAFE_BY_RCU.
+//
+
+// Item to look up in a typesafe manner. Array of pointers to these.
+struct refscale_typesafe {
+ atomic_t rts_refctr; // Used by all flavors
+ spinlock_t rts_lock;
+ seqlock_t rts_seqlock;
+ unsigned int a;
+ unsigned int b;
+};
+
+static struct kmem_cache *typesafe_kmem_cachep;
+static struct refscale_typesafe **rtsarray;
+static long rtsarray_size;
+static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand);
+static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start);
+static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start);
+
+// Conditionally acquire an explicit in-structure reference count.
+static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ return atomic_inc_not_zero(&rtsp->rts_refctr);
+}
+
+// Unconditionally release an explicit in-structure reference count.
+static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ if (!atomic_dec_return(&rtsp->rts_refctr)) {
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ kmem_cache_free(typesafe_kmem_cachep, rtsp);
+ }
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure spinlock.
+static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ spin_lock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally release an explicit in-structure spinlock.
+static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ spin_unlock(&rtsp->rts_lock);
+ return true;
+}
+
+// Unconditionally acquire an explicit in-structure sequence lock.
+static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start)
+{
+ *start = read_seqbegin(&rtsp->rts_seqlock);
+ return true;
+}
+
+// Conditionally release an explicit in-structure sequence lock. Return
+// true if this release was successful, that is, if no retry is required.
+static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start)
+{
+ return !read_seqretry(&rtsp->rts_seqlock, start);
+}
+
+// Do a read-side critical section with the specified delay in
+// microseconds and nanoseconds inserted so as to increase probability
+// of failure.
+static void typesafe_delay_section(const int nloops, const int udl, const int ndl)
+{
+ unsigned int a;
+ unsigned int b;
+ int i;
+ long idx;
+ struct refscale_typesafe *rtsp;
+ unsigned int start;
+
+ for (i = nloops; i >= 0; i--) {
+ preempt_disable();
+ idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size;
+ preempt_enable();
+retry:
+ rcu_read_lock();
+ rtsp = rcu_dereference(rtsarray[idx]);
+ a = READ_ONCE(rtsp->a);
+ if (!rts_acquire(rtsp, &start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ if (a != READ_ONCE(rtsp->a)) {
+ (void)rts_release(rtsp, start);
+ rcu_read_unlock();
+ goto retry;
+ }
+ un_delay(udl, ndl);
+ b = READ_ONCE(rtsp->a);
+ // Remember, seqlock read-side release can fail.
+ if (!rts_release(rtsp, start)) {
+ rcu_read_unlock();
+ goto retry;
+ }
+ WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b);
+ b = rtsp->b;
+ rcu_read_unlock();
+ WARN_ON_ONCE(a * a != b);
+ }
+}
+
+// Because the acquisition and release methods are expensive, there
+// is no point in optimizing away the un_delay() function's two checks.
+// Thus simply define typesafe_read_section() as a simple wrapper around
+// typesafe_delay_section().
+static void typesafe_read_section(const int nloops)
+{
+ typesafe_delay_section(nloops, 0, 0);
+}
+
+// Allocate and initialize one refscale_typesafe structure.
+static struct refscale_typesafe *typesafe_alloc_one(void)
+{
+ struct refscale_typesafe *rtsp;
+
+ rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL);
+ if (!rtsp)
+ return NULL;
+ atomic_set(&rtsp->rts_refctr, 1);
+ WRITE_ONCE(rtsp->a, rtsp->a + 1);
+ WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a);
+ return rtsp;
+}
+
+// Slab-allocator constructor for refscale_typesafe structures created
+// out of a new slab of system memory.
+static void refscale_typesafe_ctor(void *rtsp_in)
+{
+ struct refscale_typesafe *rtsp = rtsp_in;
+
+ spin_lock_init(&rtsp->rts_lock);
+ seqlock_init(&rtsp->rts_seqlock);
+ preempt_disable();
+ rtsp->a = torture_random(this_cpu_ptr(&refscale_rand));
+ preempt_enable();
+}
+
+static struct ref_scale_ops typesafe_ref_ops;
+static struct ref_scale_ops typesafe_lock_ops;
+static struct ref_scale_ops typesafe_seqlock_ops;
+
+// Initialize for a typesafe test.
+static bool typesafe_init(void)
+{
+ long idx;
+ long si = lookup_instances;
+
+ typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe",
+ sizeof(struct refscale_typesafe), sizeof(void *),
+ SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor);
+ if (!typesafe_kmem_cachep)
+ return false;
+ if (si < 0)
+ si = -si * nr_cpu_ids;
+ else if (si == 0)
+ si = nr_cpu_ids;
+ rtsarray_size = si;
+ rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL);
+ if (!rtsarray)
+ return false;
+ for (idx = 0; idx < rtsarray_size; idx++) {
+ rtsarray[idx] = typesafe_alloc_one();
+ if (!rtsarray[idx])
+ return false;
+ }
+ if (cur_ops == &typesafe_ref_ops) {
+ rts_acquire = typesafe_ref_acquire;
+ rts_release = typesafe_ref_release;
+ } else if (cur_ops == &typesafe_lock_ops) {
+ rts_acquire = typesafe_lock_acquire;
+ rts_release = typesafe_lock_release;
+ } else if (cur_ops == &typesafe_seqlock_ops) {
+ rts_acquire = typesafe_seqlock_acquire;
+ rts_release = typesafe_seqlock_release;
+ } else {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+ return true;
+}
+
+// Clean up after a typesafe test.
+static void typesafe_cleanup(void)
+{
+ long idx;
+
+ if (rtsarray) {
+ for (idx = 0; idx < rtsarray_size; idx++)
+ kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]);
+ kfree(rtsarray);
+ rtsarray = NULL;
+ rtsarray_size = 0;
+ }
+ kmem_cache_destroy(typesafe_kmem_cachep);
+ typesafe_kmem_cachep = NULL;
+ rts_acquire = NULL;
+ rts_release = NULL;
+}
+
+// The typesafe_init() function distinguishes these structures by address.
+static struct ref_scale_ops typesafe_ref_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_ref"
+};
+
+static struct ref_scale_ops typesafe_lock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_lock"
+};
+
+static struct ref_scale_ops typesafe_seqlock_ops = {
+ .init = typesafe_init,
+ .cleanup = typesafe_cleanup,
+ .readsection = typesafe_read_section,
+ .delaysection = typesafe_delay_section,
+ .name = "typesafe_seqlock"
+};
+
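The typesafe_* tests above exercise the usual SLAB_TYPESAFE_BY_RCU discipline: the slab may recycle an object for a new use while a reader still holds a pointer to it, so the reader must acquire a reference (or lock) and then re-validate the object's identity before trusting it, exactly as typesafe_delay_section() re-checks ->a. A minimal hedged sketch of that discipline outside the test harness, in which my_obj, my_cache, my_slot, and the key field are hypothetical, might look like:

/* Hedged sketch only; my_obj, my_cache, and my_slot are hypothetical. */
struct my_obj {
	atomic_t refcnt;	/* Zero once the object has been retired. */
	int key;
};

static struct kmem_cache *my_cache;	/* Created with SLAB_TYPESAFE_BY_RCU. */
static struct my_obj __rcu *my_slot;	/* RCU-protected pointer to current object. */

static struct my_obj *my_lookup(int key)
{
	struct my_obj *p;

	rcu_read_lock();
	p = rcu_dereference(my_slot);
	/* The memory cannot go back to the system inside this RCU reader,
	 * but it may already have been recycled for a new object, so
	 * conditionally acquire a reference and then re-validate. */
	if (!p || !atomic_inc_not_zero(&p->refcnt)) {
		rcu_read_unlock();
		return NULL;
	}
	if (p->key != key) {
		/* Recycled out from under us: drop the reference and fail. */
		if (atomic_dec_and_test(&p->refcnt))
			kmem_cache_free(my_cache, p);
		p = NULL;
	}
	rcu_read_unlock();
	return p;
}

In the test itself, the number of such objects is set by the new lookup_instances module parameter, with negative values meaning that many instances per CPU.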
static void rcu_scale_one_reader(void)
{
if (readdelay <= 0)
@@ -368,14 +810,14 @@ ref_scale_reader(void *arg)
u64 start;
s64 duration;
- VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me);
- set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me);
+ WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)));
set_user_nice(current, MAX_NICE);
atomic_inc(&n_init);
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
repeat:
- VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, raw_smp_processor_id());
// Wait for signal that this reader can start.
wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
@@ -385,14 +827,14 @@ repeat:
goto end;
// Make sure that the CPU is affinitized appropriately during testing.
- WARN_ON_ONCE(smp_processor_id() != me);
+ WARN_ON_ONCE(raw_smp_processor_id() != me);
WRITE_ONCE(rt->start_reader, 0);
if (!atomic_dec_return(&n_started))
while (atomic_read_acquire(&n_started))
cpu_relax();
- VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx);
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d started", me, exp_idx);
// To reduce noise, do an initial cache-warming invocation, check
@@ -421,8 +863,8 @@ repeat:
if (atomic_dec_and_test(&nreaders_exp))
wake_up(&main_wq);
- VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
- me, exp_idx, atomic_read(&nreaders_exp));
+ VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
+ me, exp_idx, atomic_read(&nreaders_exp));
if (!torture_must_stop())
goto repeat;
@@ -452,7 +894,7 @@ static u64 process_durations(int n)
char *buf;
u64 sum = 0;
- buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
+ buf = kmalloc(800 + 64, GFP_KERNEL);
if (!buf)
return 0;
buf[0] = 0;
@@ -465,13 +907,15 @@ static u64 process_durations(int n)
if (i % 5 == 0)
strcat(buf, "\n");
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
strcat(buf, buf1);
sum += rt->last_duration_ns;
}
- strcat(buf, "\n");
-
- SCALEOUT("%s\n", buf);
+ pr_alert("%s\n", buf);
kfree(buf);
return sum;
@@ -485,7 +929,6 @@ static u64 process_durations(int n)
// point all the timestamps are printed.
static int main_func(void *arg)
{
- bool errexit = false;
int exp, r;
char buf1[64];
char *buf;
@@ -496,10 +939,10 @@ static int main_func(void *arg)
VERBOSE_SCALEOUT("main_func task started");
result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
- buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
+ buf = kzalloc(800 + 64, GFP_KERNEL);
if (!result_avg || !buf) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
- errexit = true;
+ SCALEOUT_ERRSTRING("out of memory");
+ goto oom_exit;
}
if (holdoff)
schedule_timeout_interruptible(holdoff * HZ);
@@ -511,8 +954,6 @@ static int main_func(void *arg)
// Start exp readers up per experiment
for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
- if (errexit)
- break;
if (torture_must_stop())
goto end;
@@ -546,26 +987,23 @@ static int main_func(void *arg)
// Print the average of all experiments
SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
- if (!errexit) {
- buf[0] = 0;
- strcat(buf, "\n");
- strcat(buf, "Runs\tTime(ns)\n");
- }
-
+ pr_alert("Runs\tTime(ns)\n");
for (exp = 0; exp < nruns; exp++) {
u64 avg;
u32 rem;
- if (errexit)
- break;
avg = div_u64_rem(result_avg[exp], 1000, &rem);
sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
strcat(buf, buf1);
+ if (strlen(buf) >= 800) {
+ pr_alert("%s", buf);
+ buf[0] = 0;
+ }
}
- if (!errexit)
- SCALEOUT("%s", buf);
+ pr_alert("%s", buf);
+oom_exit:
// This will shutdown everything including us.
if (shutdown) {
shutdown_start = 1;
@@ -587,8 +1025,8 @@ static void
ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
- verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
+ "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
}
static void
@@ -625,7 +1063,7 @@ ref_scale_cleanup(void)
static int
ref_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq, shutdown_start);
+ wait_event_idle(shutdown_wq, shutdown_start);
smp_mb(); // Wake before output.
ref_scale_cleanup();
@@ -640,8 +1078,9 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
- &refcnt_ops, &rwlock_ops, &rwsem_ops,
+ &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,
+ &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops,
+ &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
if (!torture_init_begin(scale_type, verbose))
@@ -663,7 +1102,10 @@ ref_scale_init(void)
goto unwind;
}
if (cur_ops->init)
- cur_ops->init();
+ if (!cur_ops->init()) {
+ firsterr = -EUCLEAN;
+ goto unwind;
+ }
ref_scale_print_module_parms(cur_ops, "Start of test");
@@ -672,7 +1114,7 @@ ref_scale_init(void)
init_waitqueue_head(&shutdown_wq);
firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
shutdown_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
schedule_timeout_uninterruptible(1);
}
@@ -689,26 +1131,25 @@ ref_scale_init(void)
reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (!reader_tasks) {
- VERBOSE_SCALEOUT_ERRSTRING("out of memory");
+ SCALEOUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
- VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders);
+ VERBOSE_SCALEOUT("Starting %d reader threads", nreaders);
for (i = 0; i < nreaders; i++) {
+ init_waitqueue_head(&reader_tasks[i].wq);
firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
reader_tasks[i].task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
-
- init_waitqueue_head(&(reader_tasks[i].wq));
}
// Main Task
init_waitqueue_head(&main_wq);
firsterr = torture_create_kthread(main_func, NULL, main_task);
- if (firsterr)
+ if (torture_init_error(firsterr))
goto unwind;
torture_init_end();
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 6208c1dae5c9..c38e5933a5d6 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
ssp->srcu_gp_running = false;
ssp->srcu_gp_waiting = false;
ssp->srcu_idx = 0;
+ ssp->srcu_idx_max = 0;
INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
INIT_LIST_HEAD(&ssp->srcu_work.entry);
return 0;
@@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
WARN_ON(ssp->srcu_gp_waiting);
WARN_ON(ssp->srcu_cb_head);
WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
+ WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max);
+ WARN_ON(ssp->srcu_idx & 0x1);
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -93,10 +96,10 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
*/
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = ssp->srcu_lock_nesting[idx] - 1;
+ int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
- if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
+ if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
swake_up_one(&ssp->srcu_wq);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head))
+ if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
return; /* Already running or nothing to do. */
/* Remove recently arrived callbacks and wait for readers. */
@@ -124,16 +127,18 @@ void srcu_drive_gp(struct work_struct *wp)
ssp->srcu_cb_head = NULL;
ssp->srcu_cb_tail = &ssp->srcu_cb_head;
local_irq_enable();
- idx = ssp->srcu_idx;
- WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx);
+ idx = (ssp->srcu_idx & 0x2) / 2;
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
/* Invoke the callbacks we removed above. */
while (lh) {
rhp = lh;
lh = lh->next;
+ debug_rcu_head_callback(rhp);
local_bh_disable();
rhp->func(rhp);
local_bh_enable();
@@ -146,11 +151,27 @@ void srcu_drive_gp(struct work_struct *wp)
* straighten that out.
*/
WRITE_ONCE(ssp->srcu_gp_running, false);
- if (READ_ONCE(ssp->srcu_cb_head))
+ if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
schedule_work(&ssp->srcu_work);
}
EXPORT_SYMBOL_GPL(srcu_drive_gp);
+static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
+{
+ unsigned long cookie;
+
+ cookie = get_state_synchronize_srcu(ssp);
+ if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+ return;
+ WRITE_ONCE(ssp->srcu_idx_max, cookie);
+ if (!READ_ONCE(ssp->srcu_gp_running)) {
+ if (likely(srcu_init_done))
+ schedule_work(&ssp->srcu_work);
+ else if (list_empty(&ssp->srcu_work.entry))
+ list_add(&ssp->srcu_work.entry, &srcu_boot_list);
+ }
+}
+
/*
* Enqueue an SRCU callback on the specified srcu_struct structure,
* initiating grace-period processing if it is not already running.
@@ -166,12 +187,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
local_irq_restore(flags);
- if (!READ_ONCE(ssp->srcu_gp_running)) {
- if (likely(srcu_init_done))
- schedule_work(&ssp->srcu_work);
- else if (list_empty(&ssp->srcu_work.entry))
- list_add(&ssp->srcu_work.entry, &srcu_boot_list);
- }
+ srcu_gp_start_if_needed(ssp);
}
EXPORT_SYMBOL_GPL(call_srcu);
@@ -182,6 +198,18 @@ void synchronize_srcu(struct srcu_struct *ssp)
{
struct rcu_synchronize rs;
+ srcu_lock_sync(&ssp->dep_map);
+
+ RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return;
+
+ might_sleep();
init_rcu_head_on_stack(&rs.head);
init_completion(&rs.completion);
call_srcu(ssp, &rs.head, wakeme_after_rcu);
@@ -190,6 +218,48 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
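The RCU_LOCKDEP_WARN() added above catches self-deadlock, in which a task waits for an SRCU grace period from inside a same-type SRCU (or RCU) read-side critical section. A hedged sketch of code that would now be flagged, with my_srcu hypothetical:

/* Hedged sketch of the bug the new lockdep check reports; my_srcu is hypothetical. */
DEFINE_STATIC_SRCU(my_srcu);

static void buggy_updater(void)
{
	int idx;

	idx = srcu_read_lock(&my_srcu);
	synchronize_srcu(&my_srcu);	/* Waits on our own reader: deadlock. */
	srcu_read_unlock(&my_srcu, idx);
}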
+/*
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret;
+
+ barrier();
+ ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
+ barrier();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/*
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ *
+ * The difference between this and get_state_synchronize_srcu() is that
+ * this function ensures that poll_state_synchronize_srcu() will
+ * eventually return true.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ unsigned long ret = get_state_synchronize_srcu(ssp);
+
+ srcu_gp_start_if_needed(ssp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/*
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
+
+ barrier();
+ return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
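With this change, Tiny SRCU's ->srcu_idx advances twice per grace period (odd while a grace period is in flight), and the cookie computed in get_state_synchronize_srcu() is roughly the ->srcu_idx value at which the needed grace period will have completed; poll_state_synchronize_srcu() then compares the current ->srcu_idx against that cookie. A hedged usage sketch of the resulting polled API, with my_srcu, my_cookie, and the freeing logic hypothetical:

/* Hedged usage sketch; my_srcu, my_cookie, and my_obj are hypothetical. */
DEFINE_STATIC_SRCU(my_srcu);
static unsigned long my_cookie;

static void my_retire(void)
{
	/* Snapshot the needed grace period and make sure one is started. */
	my_cookie = start_poll_synchronize_srcu(&my_srcu);
}

static bool my_try_free(void *my_obj)
{
	if (!poll_state_synchronize_srcu(&my_srcu, my_cookie))
		return false;	/* Readers may still be in flight. */
	kfree(my_obj);		/* Grace period has elapsed. */
	return true;
}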
/* Lockdep diagnostics. */
void __init rcu_scheduler_starting(void)
{
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0f23d20d485a..0351a4e83529 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -24,6 +24,7 @@
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/srcu.h>
#include "rcu.h"
@@ -38,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444);
static ulong counter_wrap_check = (ULONG_MAX >> 2);
module_param(counter_wrap_check, ulong, 0444);
+/*
+ * Control conversion to SRCU_SIZE_BIG:
+ * 0: Don't convert at all.
+ * 1: Convert at init_srcu_struct() time.
+ * 2: Convert when rcutorture invokes srcu_torture_stats_print().
+ * 3: Decide at boot time based on system shape (default).
+ * 0x1x: Convert when excessive contention encountered.
+ */
+#define SRCU_SIZING_NONE 0
+#define SRCU_SIZING_INIT 1
+#define SRCU_SIZING_TORTURE 2
+#define SRCU_SIZING_AUTO 3
+#define SRCU_SIZING_CONTEND 0x10
+#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
+#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
+#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
+#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
+#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
+static int convert_to_big = SRCU_SIZING_AUTO;
+module_param(convert_to_big, int, 0444);
+
+/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */
+static int big_cpu_lim __read_mostly = 128;
+module_param(big_cpu_lim, int, 0444);
+
+/* Contention events per jiffy to initiate transition to big. */
+static int small_contention_lim __read_mostly = 100;
+module_param(small_contention_lim, int, 0444);
+
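Reading the encoding above: srcutree.convert_to_big=1 converts every srcu_struct to the big form at init_srcu_struct() time, srcutree.convert_to_big=2 converts when rcutorture prints statistics, and setting the 0x10 bit (for example 0x13, contention checking on top of the default boot-time decision) additionally converts any srcu_struct that sees more than small_contention_lim (default 100) ->lock contention events within a single jiffy.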
/* Early-boot callback-management, so early that no lock is required! */
static LIST_HEAD(srcu_boot_list);
static bool __read_mostly srcu_init_done;
@@ -48,39 +78,90 @@ static void process_srcu(struct work_struct *work);
static void srcu_delay_timer(struct timer_list *t);
/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
-#define spin_lock_rcu_node(p) \
-do { \
- spin_lock(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_rcu_node(p) \
+do { \
+ spin_lock(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irq_rcu_node(p) \
-do { \
- spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irq_rcu_node(p) \
+do { \
+ spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irq_rcu_node(p) \
+#define spin_unlock_irq_rcu_node(p) \
spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
-#define spin_lock_irqsave_rcu_node(p, flags) \
-do { \
- spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
- smp_mb__after_unlock_lock(); \
+#define spin_lock_irqsave_rcu_node(p, flags) \
+do { \
+ spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ smp_mb__after_unlock_lock(); \
} while (0)
-#define spin_unlock_irqrestore_rcu_node(p, flags) \
- spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
+#define spin_trylock_irqsave_rcu_node(p, flags) \
+({ \
+ bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ \
+ if (___locked) \
+ smp_mb__after_unlock_lock(); \
+ ___locked; \
+})
+
+#define spin_unlock_irqrestore_rcu_node(p, flags) \
+ spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \
/*
- * Initialize SRCU combining tree. Note that statically allocated
+ * Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
* srcu_read_unlock() running against them. So if the is_static parameter
* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
*/
-static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
+static void init_srcu_struct_data(struct srcu_struct *ssp)
+{
+ int cpu;
+ struct srcu_data *sdp;
+
+ /*
+ * Initialize the per-CPU srcu_data array, which feeds into the
+ * leaves of the srcu_node tree.
+ */
+ WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
+ ARRAY_SIZE(sdp->srcu_unlock_count));
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
+ rcu_segcblist_init(&sdp->srcu_cblist);
+ sdp->srcu_cblist_invoking = false;
+ sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
+ sdp->mynode = NULL;
+ sdp->cpu = cpu;
+ INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+ timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
+ sdp->ssp = ssp;
+ }
+}
+
+/* Invalid seq state, used during snp node initialization */
+#define SRCU_SNP_INIT_SEQ 0x2
+
+/*
+ * Check whether sequence number corresponding to snp node,
+ * is invalid.
+ */
+static inline bool srcu_invl_snp_seq(unsigned long s)
+{
+ return s == SRCU_SNP_INIT_SEQ;
+}
+
+/*
+ * Allocate and initialize the SRCU combining tree. Returns @true if
+ * allocation succeeded and @false otherwise.
+ */
+static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
{
int cpu;
int i;
@@ -90,10 +171,16 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
struct srcu_node *snp;
struct srcu_node *snp_first;
+ /* Initialize geometry if it has not already been initialized. */
+ rcu_init_geometry();
+ ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
+ if (!ssp->srcu_sup->node)
+ return false;
+
/* Work out the overall tree geometry. */
- ssp->level[0] = &ssp->node[0];
+ ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0];
for (i = 1; i < rcu_num_lvls; i++)
- ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
+ ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1];
rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Each pass through this loop initializes one srcu_node structure. */
@@ -102,23 +189,23 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
ARRAY_SIZE(snp->srcu_data_have_cbs));
for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
- snp->srcu_have_cbs[i] = 0;
+ snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
snp->srcu_data_have_cbs[i] = 0;
}
- snp->srcu_gp_seq_needed_exp = 0;
+ snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
snp->grplo = -1;
snp->grphi = -1;
- if (snp == &ssp->node[0]) {
+ if (snp == &ssp->srcu_sup->node[0]) {
/* Root node, special case. */
snp->srcu_parent = NULL;
continue;
}
/* Non-root node. */
- if (snp == ssp->level[level + 1])
+ if (snp == ssp->srcu_sup->level[level + 1])
level++;
- snp->srcu_parent = ssp->level[level - 1] +
- (snp - ssp->level[level]) /
+ snp->srcu_parent = ssp->srcu_sup->level[level - 1] +
+ (snp - ssp->srcu_sup->level[level]) /
levelspread[level - 1];
}
@@ -126,64 +213,73 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
- ARRAY_SIZE(sdp->srcu_unlock_count));
level = rcu_num_lvls - 1;
- snp_first = ssp->level[level];
+ snp_first = ssp->srcu_sup->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
- rcu_segcblist_init(&sdp->srcu_cblist);
- sdp->srcu_cblist_invoking = false;
- sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
- sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
sdp->mynode = &snp_first[cpu / levelspread[level]];
for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
if (snp->grplo < 0)
snp->grplo = cpu;
snp->grphi = cpu;
}
- sdp->cpu = cpu;
- INIT_WORK(&sdp->work, srcu_invoke_callbacks);
- timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
- sdp->ssp = ssp;
- sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
- if (is_static)
- continue;
-
- /* Dynamically allocated, better be no srcu_read_locks()! */
- for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
- sdp->srcu_lock_count[i] = 0;
- sdp->srcu_unlock_count[i] = 0;
- }
+ sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo);
}
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+ return true;
}
/*
* Initialize non-compile-time initialized fields, including the
- * associated srcu_node and srcu_data structures. The is_static
- * parameter is passed through to init_srcu_struct_nodes(), and
- * also tells us that ->sda has already been wired up to srcu_data.
+ * associated srcu_node and srcu_data structures. The is_static parameter
+ * tells us that ->sda has already been wired up to srcu_data.
*/
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
- mutex_init(&ssp->srcu_cb_mutex);
- mutex_init(&ssp->srcu_gp_mutex);
+ if (!is_static)
+ ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
+ if (!ssp->srcu_sup)
+ return -ENOMEM;
+ if (!is_static)
+ spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
+ ssp->srcu_sup->node = NULL;
+ mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
+ mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
ssp->srcu_idx = 0;
- ssp->srcu_gp_seq = 0;
- ssp->srcu_barrier_seq = 0;
- mutex_init(&ssp->srcu_barrier_mutex);
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
- INIT_DELAYED_WORK(&ssp->work, process_srcu);
+ ssp->srcu_sup->srcu_gp_seq = 0;
+ ssp->srcu_sup->srcu_barrier_seq = 0;
+ mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ ssp->srcu_sup->sda_is_static = is_static;
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
if (!ssp->sda)
- return -ENOMEM;
- init_srcu_struct_nodes(ssp, is_static);
- ssp->srcu_gp_seq_needed_exp = 0;
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
- smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
+ goto err_free_sup;
+ init_srcu_struct_data(ssp);
+ ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+ ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+ if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
+ goto err_free_sda;
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
+ }
+ ssp->srcu_sup->srcu_ssp = ssp;
+ smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
return 0;
+
+err_free_sda:
+ if (!is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ }
+err_free_sup:
+ if (!is_static) {
+ kfree(ssp->srcu_sup);
+ ssp->srcu_sup = NULL;
+ }
+ return -ENOMEM;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -194,7 +290,6 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
lockdep_init_map(&ssp->dep_map, name, key, 0);
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -211,7 +306,6 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *ssp)
{
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -219,6 +313,86 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
+ * Initiate a transition to SRCU_SIZE_BIG with lock held.
+ */
+static void __srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
+}
+
+/*
+ * Initiate an idempotent transition to SRCU_SIZE_BIG.
+ */
+static void srcu_transition_to_big(struct srcu_struct *ssp)
+{
+ unsigned long flags;
+
+ /* Double-checked locking on ->srcu_size_state. */
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+ return;
+ }
+ __srcu_transition_to_big(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
+}
+
+/*
+ * Check to see if the just-encountered contention event justifies
+ * a transition to SRCU_SIZE_BIG.
+ */
+static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
+{
+ unsigned long j;
+
+ if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
+ return;
+ j = jiffies;
+ if (ssp->srcu_sup->srcu_size_jiffies != j) {
+ ssp->srcu_sup->srcu_size_jiffies = j;
+ ssp->srcu_sup->srcu_n_lock_retries = 0;
+ }
+ if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
+ return;
+ __srcu_transition_to_big(ssp);
+}
+
+/*
+ * Acquire the specified srcu_data structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
+{
+ struct srcu_struct *ssp = sdp->ssp;
+
+ if (spin_trylock_irqsave_rcu_node(sdp, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_rcu_node(sdp, *flags);
+}
+
+/*
+ * Acquire the specified srcu_struct structure's ->lock, but check for
+ * excessive contention, which results in initiation of a transition
+ * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
+ * parameter permits this.
+ */
+static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
+{
+ if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
+ return;
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
+ spin_lock_irqsave_check_contention(ssp);
+}
+
+/*
* First-use initialization of statically allocated srcu_struct
* structure. Wiring up the combining tree is more than can be
* done with compile-time initialization, so this check is added
@@ -231,15 +405,15 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
unsigned long flags;
/* The smp_load_acquire() pairs with the smp_store_release(). */
- if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
init_srcu_struct_fields(ssp, true);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -254,7 +428,7 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[idx]);
}
return sum;
}
@@ -266,13 +440,18 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
{
int cpu;
+ unsigned long mask = 0;
unsigned long sum = 0;
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
+ sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
+ if (IS_ENABLED(CONFIG_PROVE_RCU))
+ mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
}
+ WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
+ "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
return sum;
}
@@ -301,24 +480,59 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
/*
* If the locks are the same as the unlocks, then there must have
- * been no readers on this index at some time in between. This does
- * not mean that there are no more readers, as one could have read
- * the current index but not have incremented the lock counter yet.
+ * been no readers on this index at some point in this function.
+ * But there might be more readers, as a task might have read
+ * the current ->srcu_idx but not yet have incremented its CPU's
+ * ->srcu_lock_count[idx] counter. In fact, it is possible
+ * that most of the tasks have been preempted between fetching
+ * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
+ * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
+ * in a system whose address space was fully populated with memory.
+ * Call this quantity Nt.
+ *
+ * So suppose that the updater is preempted at this point in the
+ * code for a long time. That now-preempted updater has already
+ * flipped ->srcu_idx (possibly during the preceding grace period),
+ * done an smp_mb() (again, possibly during the preceding grace
+ * period), and summed up the ->srcu_unlock_count[idx] counters.
+ * How many times can a given one of the aforementioned Nt tasks
+ * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
+ * counter, in the absence of nesting?
+ *
+ * It can clearly do so once, given that it has already fetched
+ * the old value of ->srcu_idx and is just about to use that value
+ * to index its increment of ->srcu_lock_count[idx]. But as soon as
+ * it leaves that SRCU read-side critical section, it will increment
+ * ->srcu_unlock_count[idx], which must follow the updater's above
+ * read from that same value. Thus, as soon the reading task does
+ * an smp_mb() and a later fetch from ->srcu_idx, that task will be
+ * guaranteed to get the new index. Except that the increment of
+ * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
+ * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
+ * is before the smp_mb(). Thus, that task might not see the new
+ * value of ->srcu_idx until the -second- __srcu_read_lock(),
+ * which in turn means that this task might well increment
+ * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
+ * not just once.
+ *
+ * However, it is important to note that a given smp_mb() takes
+ * effect not just for the task executing it, but also for any
+ * later task running on that same CPU.
*
- * So suppose that the updater is preempted here for so long
- * that more than ULONG_MAX non-nested readers come and go in
- * the meantime. It turns out that this cannot result in overflow
- * because if a reader modifies its unlock count after we read it
- * above, then that reader's next load of ->srcu_idx is guaranteed
- * to get the new value, which will cause it to operate on the
- * other bank of counters, where it cannot contribute to the
- * overflow of these counters. This means that there is a maximum
- * of 2*NR_CPUS increments, which cannot overflow given current
- * systems, especially not on 64-bit systems.
+ * That is, there can be almost Nt + Nc further increments of
+ * ->srcu_lock_count[idx] for the old index, where Nc is the number
+ * of CPUs. But this is OK because the size of the task_struct
+ * structure limits the value of Nt and current systems limit Nc
+ * to a few thousand.
*
- * OK, how about nesting? This does impose a limit on nesting
- * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
- * especially on 64-bit systems.
+ * OK, but what about nesting? This does impose a limit on
+ * nesting of half of the size of the task_struct structure
+ * (measured in bytes), which should be sufficient. A late 2022
+ * TREE01 rcutorture run reported this size to be no less than
+ * 9408 bytes, allowing up to 4704 levels of nesting, which is
+ * comfortably beyond excessive. Especially on 64-bit systems,
+ * which are unlikely to be configured with an address space fully
+ * populated with memory, at least not anytime soon.
*/
return srcu_readers_lock_idx(ssp, idx) == unlocks;
}
@@ -340,15 +554,60 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
for_each_possible_cpu(cpu) {
struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
- sum += READ_ONCE(cpuc->srcu_lock_count[0]);
- sum += READ_ONCE(cpuc->srcu_lock_count[1]);
- sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
- sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[0]);
+ sum += atomic_long_read(&cpuc->srcu_lock_count[1]);
+ sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]);
+ sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]);
}
return sum;
}
-#define SRCU_INTERVAL 1
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below, boot time configurable) to allow SRCU readers to exit
+ * their read-side critical sections. If there are still some readers
+ * after one jiffy, we repeatedly block for one-jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
+ // no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
+ // no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// Per-GP-phase no-delay instances adjusted to allow a non-sleeping poll of up
+// to one jiffy. The multiplication by 2 factors in the srcu_get_delay()
+// called from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
+ (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
+ SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
+ SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
+
+static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
+module_param(srcu_max_nodelay, ulong, 0444);
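As a worked example of these defaults (assuming HZ=1000 and the default srcu_retry_check_delay of 5 microseconds): SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED is 2 * 1000000 / 1000 / 5 = 400, which falls inside the [3, 1000] clamp, so SRCU_DEFAULT_MAX_NODELAY_PHASE is 400 and SRCU_DEFAULT_MAX_NODELAY is also 400 (being above the 100 floor). With HZ=100 the adjusted value would be 4000, and the clamp would cap the per-phase limit at 1000.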
/*
* Return grace-period delay, zero if there are expedited grace
@@ -356,10 +615,25 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
*/
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq),
- READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
- return 0;
- return SRCU_INTERVAL;
+ unsigned long gpstart;
+ unsigned long j;
+ unsigned long jbase = SRCU_INTERVAL;
+ struct srcu_usage *sup = ssp->srcu_sup;
+
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ jbase = 0;
+ if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
+ j = jiffies - 1;
+ gpstart = READ_ONCE(sup->srcu_gp_start);
+ if (time_after(j, gpstart))
+ jbase += j - gpstart;
+ if (!jbase) {
+ WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+ jbase = 1;
+ }
+ }
+ return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}
/**
@@ -372,12 +646,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ struct srcu_usage *sup = ssp->srcu_sup;
if (WARN_ON(!srcu_get_delay(ssp)))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
- flush_delayed_work(&ssp->work);
+ flush_delayed_work(&sup->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
@@ -386,17 +661,49 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
return; /* Forgot srcu_barrier(), so just leak it! */
}
- if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) ||
WARN_ON(srcu_readers_active(ssp))) {
- pr_info("%s: Active srcu_struct %p state: %d\n",
- __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)));
+ pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
+ __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
+ rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
return; /* Caller forgot to stop doing call_srcu()? */
}
- free_percpu(ssp->sda);
- ssp->sda = NULL;
+ kfree(sup->node);
+ sup->node = NULL;
+ sup->srcu_size_state = SRCU_SIZE_SMALL;
+ if (!sup->sda_is_static) {
+ free_percpu(ssp->sda);
+ ssp->sda = NULL;
+ kfree(sup);
+ ssp->srcu_sup = NULL;
+ }
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+#ifdef CONFIG_PROVE_RCU
+/*
+ * Check for consistent NMI safety.
+ */
+void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
+{
+ int nmi_safe_mask = 1 << nmi_safe;
+ int old_nmi_safe_mask;
+ struct srcu_data *sdp;
+
+ /* NMI-unsafe use in NMI is a bad sign */
+ WARN_ON_ONCE(!nmi_safe && in_nmi());
+ sdp = raw_cpu_ptr(ssp->sda);
+ old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
+ if (!old_nmi_safe_mask) {
+ WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
+ return;
+ }
+ WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
+}
+EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
+#endif /* CONFIG_PROVE_RCU */
+
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct.
@@ -407,7 +714,7 @@ int __srcu_read_lock(struct srcu_struct *ssp)
int idx;
idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
+ this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
return idx;
}
@@ -421,38 +728,59 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
+ this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
+#ifdef CONFIG_NEED_SRCU_NMI_SAFE
+
/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited(). We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections. If there are still some readers after a few microseconds,
- * we repeatedly block for 1-millisecond time periods.
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct, but in an NMI-safe manner using RMW atomics.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
*/
-#define SRCU_RETRY_CHECK_DELAY 5
+int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
+{
+ int idx;
+ struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+
+ idx = READ_ONCE(ssp->srcu_idx) & 0x1;
+ atomic_long_inc(&sdp->srcu_lock_count[idx]);
+ smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
+ return idx;
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct. Note that this may well be a different
+ * CPU than that which was incremented by the corresponding srcu_read_lock().
+ */
+void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
+{
+ struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+
+ smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
+ atomic_long_inc(&sdp->srcu_unlock_count[idx]);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
+
+#endif // CONFIG_NEED_SRCU_NMI_SAFE
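These functions back srcu_read_lock_nmisafe() and srcu_read_unlock_nmisafe(), which trade this_cpu_inc() for full RMW atomics so the counters stay consistent even when the reader runs in NMI context; the PROVE_RCU check added to srcu_readers_unlock_idx() warns if a given srcu_struct mixes NMI-safe and plain readers. A hedged sketch of an NMI-path reader, with my_srcu and my_handle_sample() hypothetical:

/* Hedged sketch; my_srcu and my_handle_sample() are hypothetical. */
DEFINE_STATIC_SRCU(my_srcu);

static void my_handle_sample(void);

static void my_nmi_handler(void)
{
	int idx;

	/* RMW atomics keep the counters sane even if this NMI arrived
	 * in the middle of a __srcu_read_lock() on this same CPU. */
	idx = srcu_read_lock_nmisafe(&my_srcu);
	my_handle_sample();
	srcu_read_unlock_nmisafe(&my_srcu, idx);
}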
/*
* Start an SRCU grace period.
*/
static void srcu_gp_start(struct srcu_struct *ssp)
{
- struct srcu_data *sdp = this_cpu_ptr(ssp->sda);
int state;
- lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
- spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
- rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
- spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
+ WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
- rcu_seq_start(&ssp->srcu_gp_seq);
- state = rcu_seq_state(ssp->srcu_gp_seq);
+ rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
+ state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
@@ -496,7 +824,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
int cpu;
for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- if (!(mask & (1 << (cpu - snp->grplo))))
+ if (!(mask & (1UL << (cpu - snp->grplo))))
continue;
srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
}
@@ -513,7 +841,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
*/
static void srcu_gp_end(struct srcu_struct *ssp)
{
- unsigned long cbdelay;
+ unsigned long cbdelay = 1;
bool cbs;
bool last_lvl;
int cpu;
@@ -522,71 +850,92 @@ static void srcu_gp_end(struct srcu_struct *ssp)
int idx;
unsigned long mask;
struct srcu_data *sdp;
+ unsigned long sgsne;
struct srcu_node *snp;
+ int ss_state;
+ struct srcu_usage *sup = ssp->srcu_sup;
/* Prevent more than one additional grace period. */
- mutex_lock(&ssp->srcu_cb_mutex);
+ mutex_lock(&sup->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq_rcu_node(ssp);
- idx = rcu_seq_state(ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- cbdelay = srcu_get_delay(ssp);
- WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
- rcu_seq_end(&ssp->srcu_gp_seq);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
+ cbdelay = 0;
+
+ WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
+ rcu_seq_end(&sup->srcu_gp_seq);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
+ spin_unlock_irq_rcu_node(sup);
+ mutex_unlock(&sup->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
/* Initiate callback invocation as needed. */
- idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
- srcu_for_each_node_breadth_first(ssp, snp) {
- spin_lock_irq_rcu_node(snp);
- cbs = false;
- last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
- if (last_lvl)
- cbs = snp->srcu_have_cbs[idx] == gpseq;
- snp->srcu_have_cbs[idx] = gpseq;
- rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
- if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
- mask = snp->srcu_data_have_cbs[idx];
- snp->srcu_data_have_cbs[idx] = 0;
- spin_unlock_irq_rcu_node(snp);
- if (cbs)
- srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
-
- /* Occasionally prevent srcu_data counter wrap. */
- if (!(gpseq & counter_wrap_check) && last_lvl)
- for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irqsave_rcu_node(sdp, flags);
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed + 100))
- sdp->srcu_gp_seq_needed = gpseq;
- if (ULONG_CMP_GE(gpseq,
- sdp->srcu_gp_seq_needed_exp + 100))
- sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- }
+ ss_state = smp_load_acquire(&sup->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
+ srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
+ cbdelay);
+ } else {
+ idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+ srcu_for_each_node_breadth_first(ssp, snp) {
+ spin_lock_irq_rcu_node(snp);
+ cbs = false;
+ last_lvl = snp >= sup->level[rcu_num_lvls - 1];
+ if (last_lvl)
+ cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
+ snp->srcu_have_cbs[idx] = gpseq;
+ rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
+ if (ss_state < SRCU_SIZE_BIG)
+ mask = ~0;
+ else
+ mask = snp->srcu_data_have_cbs[idx];
+ snp->srcu_data_have_cbs[idx] = 0;
+ spin_unlock_irq_rcu_node(snp);
+ if (cbs)
+ srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
+ }
}
+ /* Occasionally prevent srcu_data counter wrap. */
+ if (!(gpseq & counter_wrap_check))
+ for_each_possible_cpu(cpu) {
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ spin_lock_irqsave_rcu_node(sdp, flags);
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
+ sdp->srcu_gp_seq_needed = gpseq;
+ if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
+ sdp->srcu_gp_seq_needed_exp = gpseq;
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+ }
+
/* Callback initiation done, allow grace periods after next. */
- mutex_unlock(&ssp->srcu_cb_mutex);
+ mutex_unlock(&sup->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq_rcu_node(ssp);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
- ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
+ ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
srcu_reschedule(ssp, 0);
} else {
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
+ }
+
+ /* Transition to big if needed. */
+ if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
+ if (ss_state == SRCU_SIZE_ALLOC)
+ init_srcu_struct_nodes(ssp, GFP_KERNEL);
+ else
+ smp_store_release(&sup->srcu_size_state, ss_state + 1);
}
}
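/*
 * For illustration (not part of this patch): once a transition to the big
 * form has been requested, each pass through srcu_gp_end() advances
 * ->srcu_size_state one step along the progression listed in
 * srcu_size_state_name[] further below:
 *
 *   SRCU_SIZE_SMALL -> SRCU_SIZE_ALLOC -> SRCU_SIZE_WAIT_BARRIER ->
 *   SRCU_SIZE_WAIT_CALL -> SRCU_SIZE_WAIT_CBS1 ... SRCU_SIZE_WAIT_CBS4 ->
 *   SRCU_SIZE_BIG
 *
 * so the srcu_node tree is allocated and brought into use over several
 * grace periods rather than in a single expensive step.
 */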
@@ -601,23 +950,27 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
unsigned long s)
{
unsigned long flags;
+ unsigned long sgsne;
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
- ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
- return;
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+ if (snp)
+ for (; snp != NULL; snp = snp->srcu_parent) {
+ sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
+ (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
+ return;
+ spin_lock_irqsave_rcu_node(snp, flags);
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ return;
+ }
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(snp, flags);
- return;
}
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
+ if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -629,67 +982,85 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
*
* Note that this function also does the work of srcu_funnel_exp_start(),
* in some cases by directly invoking it.
+ *
+ * The SRCU read-side lock should be held around this function, and s must be
+ * a sequence-number snapshot taken after acquiring that lock.
*/
static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
unsigned long s, bool do_norm)
{
unsigned long flags;
int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
- struct srcu_node *snp = sdp->mynode;
+ unsigned long sgsne;
+ struct srcu_node *snp;
+ struct srcu_node *snp_leaf;
unsigned long snp_seq;
+ struct srcu_usage *sup = ssp->srcu_sup;
- /* Each pass through the loop does one level of the srcu_node tree. */
- for (; snp != NULL; snp = snp->srcu_parent) {
- if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode)
- return; /* GP already done and CBs recorded. */
- spin_lock_irqsave_rcu_node(snp, flags);
- if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+	/* Ensure that the snp node tree is fully initialized before traversing it */
+ if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ snp_leaf = NULL;
+ else
+ snp_leaf = sdp->mynode;
+
+ if (snp_leaf)
+ /* Each pass through the loop does one level of the srcu_node tree. */
+ for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
+ if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
+ return; /* GP already done and CBs recorded. */
+ spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
- if (snp == sdp->mynode && snp_seq == s)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- spin_unlock_irqrestore_rcu_node(snp, flags);
- if (snp == sdp->mynode && snp_seq != s) {
- srcu_schedule_cbs_sdp(sdp, do_norm
- ? SRCU_INTERVAL
- : 0);
+ if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
+ if (snp == snp_leaf && snp_seq == s)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ spin_unlock_irqrestore_rcu_node(snp, flags);
+ if (snp == snp_leaf && snp_seq != s) {
+ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
+ return;
+ }
+ if (!do_norm)
+ srcu_funnel_exp_start(ssp, snp, s);
return;
}
- if (!do_norm)
- srcu_funnel_exp_start(ssp, snp, s);
- return;
+ snp->srcu_have_cbs[idx] = s;
+ if (snp == snp_leaf)
+ snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+ sgsne = snp->srcu_gp_seq_needed_exp;
+ if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(snp, flags);
}
- snp->srcu_have_cbs[idx] = s;
- if (snp == sdp->mynode)
- snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
- if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(snp, flags);
- }
/* Top of tree, must ensure the grace period will be started. */
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
+ spin_lock_irqsave_ssp_contention(ssp, &flags);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
* acquire setting up for initialization.
*/
- smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
+ smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/
}
- if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
+ if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s);
- /* If grace period not already done and none in progress, start it. */
- if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
- rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ /* If grace period not already in progress, start it. */
+ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
+ rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
srcu_gp_start(ssp);
+
+ // And how can that list_add() in the "else" clause
+ // possibly be safe for concurrent execution? Well,
+ // it isn't. And it does not have to be. After all, it
+ // can only be executed during early boot when there is only
+ // the one boot CPU running with interrupts still disabled.
if (likely(srcu_init_done))
- queue_delayed_work(rcu_gp_wq, &ssp->work,
- srcu_get_delay(ssp));
- else if (list_empty(&ssp->work.work.entry))
- list_add(&ssp->work.work.entry, &srcu_boot_list);
+ queue_delayed_work(rcu_gp_wq, &sup->work,
+ !!srcu_get_delay(ssp));
+ else if (list_empty(&sup->work.work.entry))
+ list_add(&sup->work.work.entry, &srcu_boot_list);
}
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(sup, flags);
}
/*
@@ -699,12 +1070,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
+ unsigned long curdelay;
+
+ curdelay = !srcu_get_delay(ssp);
+
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
return true;
- if (--trycount + !srcu_get_delay(ssp) <= 0)
+ if ((--trycount + curdelay) <= 0)
return false;
- udelay(SRCU_RETRY_CHECK_DELAY);
+ udelay(srcu_retry_check_delay);
}
}
@@ -716,23 +1091,44 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Ensure that if this updater saw a given reader's increment
- * from __srcu_read_lock(), that reader was using an old value
- * of ->srcu_idx. Also ensure that if a given reader sees the
- * new value of ->srcu_idx, this updater's earlier scans cannot
- * have seen that reader's increments (which is OK, because this
- * grace period need not wait on that reader).
+ * Because the flip of ->srcu_idx is executed only if the
+ * preceding call to srcu_readers_active_idx_check() found that
+ * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
+ * and because that summing uses atomic_long_read(), there is
+ * ordering due to a control dependency between that summing and
+ * the WRITE_ONCE() in this call to srcu_flip(). This ordering
+ * ensures that if this updater saw a given reader's increment from
+ * __srcu_read_lock(), that reader was using a value of ->srcu_idx
+ * from before the previous call to srcu_flip(), which should be
+ * quite rare. This ordering thus helps forward progress because
+ * the grace period could otherwise be delayed by additional
+ * calls to __srcu_read_lock() using that old (soon to be new)
+ * value of ->srcu_idx.
+ *
+ * This sum-equality check and ordering also ensures that if
+ * a given call to __srcu_read_lock() uses the new value of
+ * ->srcu_idx, this updater's earlier scans cannot have seen
+ * that reader's increments, which is all to the good, because
+ * this grace period need not wait on that reader. After all,
+ * if those earlier scans had seen that reader, there would have
+ * been a sum mismatch and this code would not be reached.
+ *
+ * This means that the following smp_mb() is redundant, but
+ * it stays until either (1) Compilers learn about this sort of
+ * control dependency or (2) Some production workload running on
+ * a production system is unduly delayed by this slowpath smp_mb().
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
/*
* Ensure that if the updater misses an __srcu_read_unlock()
- * increment, that task's next __srcu_read_lock() will see the
- * above counter update. Note that both this memory barrier
- * and the one in srcu_readers_active_idx_check() provide the
- * guarantee for __srcu_read_lock().
+ * increment, that task's __srcu_read_lock() following its next
+ * __srcu_read_lock() or __srcu_read_unlock() will see the above
+ * counter update. Note that both this memory barrier and the
+ * one in srcu_readers_active_idx_check() provide the guarantee
+ * for __srcu_read_lock().
*/
smp_mb(); /* D */ /* Pairs with C. */
}
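/*
 * Illustrative sketch (not part of this patch): the barrier pairing described
 * above, with the reader side abbreviated from __srcu_read_lock() and
 * __srcu_read_unlock():
 *
 *   Reader                                  Updater
 *   ------                                  -------
 *   idx = READ_ONCE(ssp->srcu_idx) & 0x1;   sum ->srcu_{,un}lock_count[idx]
 *   increment ->srcu_lock_count[idx]        smp_mb();  E  (pairs with B and C)
 *   smp_mb();  B                            WRITE_ONCE(ssp->srcu_idx, idx + 1);
 *   ... read-side critical section ...      smp_mb();  D  (pairs with C)
 *   smp_mb();  C
 *   increment ->srcu_unlock_count[idx]
 */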
@@ -777,25 +1173,25 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
spin_unlock_irqrestore_rcu_node(sdp, flags);
/*
- * No local callbacks, so probabalistically probe global state.
+ * No local callbacks, so probabilistically probe global state.
* Exact information would require acquiring locks, which would
- * kill scalability, hence the probabalistic nature of the probe.
+ * kill scalability, hence the probabilistic nature of the probe.
*/
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
- tlast = READ_ONCE(ssp->srcu_last_gp_end);
+ tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
if (exp_holdoff == 0 ||
time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
- curseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
- if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
+ if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
return false; /* Grace period in progress, so not idle. */
smp_mb(); /* Order ->srcu_gp_seq with prior access. */
- if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
+ if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq))
return false; /* GP # changed, so not idle. */
return true; /* With reasonable probability, idle! */
}
@@ -808,6 +1204,93 @@ static void srcu_leak_callback(struct rcu_head *rhp)
}
/*
+ * Start an SRCU grace period, and also queue the callback if non-NULL.
+ */
+static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
+ struct rcu_head *rhp, bool do_norm)
+{
+ unsigned long flags;
+ int idx;
+ bool needexp = false;
+ bool needgp = false;
+ unsigned long s;
+ struct srcu_data *sdp;
+ struct srcu_node *sdp_mynode;
+ int ss_state;
+
+ check_init_srcu_struct(ssp);
+ /*
+ * While starting a new grace period, make sure we are in an
+ * SRCU read-side critical section so that the grace-period
+ * sequence number cannot wrap around in the meantime.
+ */
+ idx = __srcu_read_lock_nmisafe(ssp);
+ ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
+ if (ss_state < SRCU_SIZE_WAIT_CALL)
+ sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
+ else
+ sdp = raw_cpu_ptr(ssp->sda);
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (rhp)
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
+ /*
+ * The snapshot for acceleration must be taken _before_ the read of the
+ * current gp sequence used for advancing, otherwise advancing may fail
+ * and acceleration may then fail too.
+ *
+ * This could happen if:
+ *
+ * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
+ * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
+ *
+ * 2) The grace period for RCU_WAIT_TAIL is seen as started but not
+ * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
+ *
+ * 3) This value is passed to rcu_segcblist_advance() which can't move
+ * any segment forward and fails.
+ *
+ * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
+ * But then the call to rcu_seq_snap() observes the grace period for the
+ * RCU_WAIT_TAIL segment as completed and the subsequent one for the
+ * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
+ * so it returns a snapshot of the next grace period, which is X + 12.
+ *
+ * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
+ * freshly enqueued callback in RCU_NEXT_TAIL can't move to
+ * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
+ * period (gp_num = X + 8). So acceleration fails.
+ */
+ s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
+ if (rhp) {
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
+ }
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+ sdp->srcu_gp_seq_needed = s;
+ needgp = true;
+ }
+ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
+ sdp->srcu_gp_seq_needed_exp = s;
+ needexp = true;
+ }
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
+
+	/* Ensure that the snp node tree is fully initialized before traversing it */
+ if (ss_state < SRCU_SIZE_WAIT_BARRIER)
+ sdp_mynode = NULL;
+ else
+ sdp_mynode = sdp->mynode;
+
+ if (needgp)
+ srcu_funnel_gp_start(ssp, sdp, s, do_norm);
+ else if (needexp)
+ srcu_funnel_exp_start(ssp, sdp_mynode, s);
+ __srcu_read_unlock_nmisafe(ssp, idx);
+ return s;
+}
+
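/*
 * Illustrative sketch (not part of this patch): a stand-alone check of the
 * sequence arithmetic in the acceleration comment above, assuming the
 * RCU_SEQ_* definitions below mirror kernel/rcu/rcu.h (two state bits, so
 * each grace period advances the counter by 4).
 */
#include <assert.h>

#define RCU_SEQ_CTR_SHIFT	2
#define RCU_SEQ_STATE_MASK	((1UL << RCU_SEQ_CTR_SHIFT) - 1)
#define SRCU_STATE_SCAN1	1

/* Simplified rcu_seq_snap(): counter value marking the end of a full GP. */
static unsigned long seq_snap(unsigned long seq)
{
	return (seq + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
}

int main(void)
{
	unsigned long x = 100;	/* Any multiple of 4 plays the role of X. */

	/* Step 2: the grace period for RCU_WAIT_TAIL (X + 4) has started. */
	assert(seq_snap(x + SRCU_STATE_SCAN1) == x + 8);
	/* Step 4: that grace period completed and the next one started. */
	assert(seq_snap(x + 4 + SRCU_STATE_SCAN1) == x + 12);
	return 0;
}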
+/*
* Enqueue an SRCU callback on the srcu_data structure associated with
* the current CPU and the specified srcu_struct structure, initiating
* grace-period processing if it is not already running.
@@ -838,14 +1321,6 @@ static void srcu_leak_callback(struct rcu_head *rhp)
static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func, bool do_norm)
{
- unsigned long flags;
- int idx;
- bool needexp = false;
- bool needgp = false;
- unsigned long s;
- struct srcu_data *sdp;
-
- check_init_srcu_struct(ssp);
if (debug_rcu_head_queue(rhp)) {
/* Probable double call_srcu(), so leak the callback. */
WRITE_ONCE(rhp->func, srcu_leak_callback);
@@ -853,28 +1328,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
return;
}
rhp->func = func;
- idx = srcu_read_lock(ssp);
- sdp = raw_cpu_ptr(ssp->sda);
- spin_lock_irqsave_rcu_node(sdp, flags);
- rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
- rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
- if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
- sdp->srcu_gp_seq_needed = s;
- needgp = true;
- }
- if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
- sdp->srcu_gp_seq_needed_exp = s;
- needexp = true;
- }
- spin_unlock_irqrestore_rcu_node(sdp, flags);
- if (needgp)
- srcu_funnel_gp_start(ssp, sdp, s, do_norm);
- else if (needexp)
- srcu_funnel_exp_start(ssp, sdp->mynode, s);
- srcu_read_unlock(ssp, idx);
+ (void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
}
/**
@@ -908,6 +1362,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
struct rcu_synchronize rcu;
+ srcu_lock_sync(&ssp->dep_map);
+
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
@@ -989,6 +1445,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
* synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
* passed the same srcu_struct structure.
*
+ * Implementation of these memory-ordering guarantees is similar to
+ * that of synchronize_rcu().
+ *
* If SRCU is likely idle, expedite the first request. This semantic
* was provided by Classic SRCU, and is relied upon by its users, so TREE
* SRCU must also provide it. Note that detecting idleness is heuristic
@@ -1003,6 +1462,77 @@ void synchronize_srcu(struct srcu_struct *ssp)
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+/**
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. It is the caller's responsibility
+ * to make sure that grace period happens, for example, by invoking
+ * call_srcu() after return from get_state_synchronize_srcu().
+ */
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
+{
+ // Any prior manipulation of SRCU-protected data must happen
+ // before the load from ->srcu_gp_seq.
+ smp_mb();
+ return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
+
+/**
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
+ * @ssp: srcu_struct to provide cookie for.
+ *
+ * This function returns a cookie that can be passed to
+ * poll_state_synchronize_srcu(), which will return true if a full grace
+ * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(),
+ * this function also ensures that any needed SRCU grace period will be
+ * started. This convenience does come at a cost in terms of CPU overhead.
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+ return srcu_gp_start_if_needed(ssp, NULL, true);
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/**
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ * @ssp: srcu_struct to provide cookie for.
+ * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
+ *
+ * This function takes the cookie that was returned from either
+ * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
+ * returns @true if an SRCU grace period elapsed since the time that the
+ * cookie was created.
+ *
+ * Because cookies are finite in size, wrapping/overflow is possible.
+ * This is more pronounced on 32-bit systems where cookies are 32 bits,
+ * where in theory wrapping could happen in about 14 hours assuming
+ * 25-microsecond expedited SRCU grace periods. However, a more likely
+ * overflow lower bound is on the order of 24 days in the case of
+ * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit
+ * system requires geologic timespans, as in more than seven million years
+ * even for expedited SRCU grace periods.
+ *
+ * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems
+ * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses
+ * a 16-bit cookie, which rcutorture routinely wraps in a matter of a
+ * few minutes. If this proves to be a problem, this counter will be
+ * expanded to the same size as for Tree SRCU.
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+ if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
+ return false;
+ // Ensure that the end of the SRCU grace period happens before
+ // any subsequent code that the caller might execute.
+ smp_mb(); // ^^^
+ return true;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
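/*
 * Illustrative sketch (hypothetical code, not part of this patch): one way
 * a caller might use the polling interfaces above, freeing an object only
 * once the grace period recorded at retirement time has elapsed.  The
 * structure and function names are invented for the example.
 */
struct poll_example {
	unsigned long gp_cookie;
	/* ... payload protected by the associated srcu_struct ... */
};

static void poll_example_retire(struct srcu_struct *ssp, struct poll_example *p)
{
	/* Start any needed grace period and record which one to wait for. */
	p->gp_cookie = start_poll_synchronize_srcu(ssp);
}

static bool poll_example_try_free(struct srcu_struct *ssp, struct poll_example *p)
{
	if (!poll_state_synchronize_srcu(ssp, p->gp_cookie))
		return false;	/* Readers might remain, retry later. */
	kfree(p);
	return true;
}

/*
 * For reference, the wrap estimates quoted above correspond to 2^31 and
 * 2^63 counter increments: 2^31 * 25 us is about 14.9 hours, 2^31 * 1 ms
 * is about 24.9 days, and 2^63 * 25 us is roughly 7.3 million years.
 */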
/*
* Callback function for srcu_barrier() use.
*/
@@ -1013,8 +1543,30 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
ssp = sdp->ssp;
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+}
+
+/*
+ * Enqueue an srcu_barrier() callback on the specified srcu_data
+ * structure's ->cblist, but only if that ->cblist already has at least one
+ * callback enqueued. Note that if a CPU already has callbacks enqueued,
+ * it must have already registered the need for a future grace period,
+ * so all we need do is enqueue a callback that will use the same grace
+ * period as the last callback already in the queue.
+ */
+static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
+{
+ spin_lock_irq_rcu_node(sdp);
+ atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
+ sdp->srcu_barrier_head.func = srcu_barrier_cb;
+ debug_rcu_head_queue(&sdp->srcu_barrier_head);
+ if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+ &sdp->srcu_barrier_head)) {
+ debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
+ atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
+ }
+ spin_unlock_irq_rcu_node(sdp);
}
/**
@@ -1024,51 +1576,37 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
void srcu_barrier(struct srcu_struct *ssp)
{
int cpu;
- struct srcu_data *sdp;
- unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
+ int idx;
+ unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq);
check_init_srcu_struct(ssp);
- mutex_lock(&ssp->srcu_barrier_mutex);
- if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
+ mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
+ if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) {
smp_mb(); /* Force ordering following return. */
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
return; /* Someone else did our work for us. */
}
- rcu_seq_start(&ssp->srcu_barrier_seq);
- init_completion(&ssp->srcu_barrier_completion);
+ rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq);
+ init_completion(&ssp->srcu_sup->srcu_barrier_completion);
/* Initial count prevents reaching zero until all CBs are posted. */
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1);
- /*
- * Each pass through this loop enqueues a callback, but only
- * on CPUs already having callbacks enqueued. Note that if
- * a CPU already has callbacks enqueue, it must have already
- * registered the need for a future grace period, so all we
- * need do is enqueue a callback that will use the same
- * grace period as the last callback already in the queue.
- */
- for_each_possible_cpu(cpu) {
- sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irq_rcu_node(sdp);
- atomic_inc(&ssp->srcu_barrier_cpu_cnt);
- sdp->srcu_barrier_head.func = srcu_barrier_cb;
- debug_rcu_head_queue(&sdp->srcu_barrier_head);
- if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
- &sdp->srcu_barrier_head)) {
- debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
- atomic_dec(&ssp->srcu_barrier_cpu_cnt);
- }
- spin_unlock_irq_rcu_node(sdp);
- }
+ idx = __srcu_read_lock_nmisafe(ssp);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
+ else
+ for_each_possible_cpu(cpu)
+ srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
+ __srcu_read_unlock_nmisafe(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
- wait_for_completion(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+ wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);
- rcu_seq_end(&ssp->srcu_barrier_seq);
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);
@@ -1094,7 +1632,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
{
int idx;
- mutex_lock(&ssp->srcu_gp_mutex);
+ mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);
/*
* Because readers might be delayed for an extended period after
@@ -1106,38 +1644,39 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* The load-acquire ensures that we see the accesses performed
* by the prior grace period.
*/
- idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
+ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return;
}
- idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
+ idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (idx != SRCU_STATE_IDLE) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* Someone else started the grace period. */
}
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 1)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
srcu_flip(ssp);
- spin_lock_irq_rcu_node(ssp);
- rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
- spin_unlock_irq_rcu_node(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
/*
* SRCU read-side critical sections are normally short,
@@ -1145,9 +1684,10 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 2)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
}
}
@@ -1160,6 +1700,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
static void srcu_invoke_callbacks(struct work_struct *work)
{
+ long len;
bool more;
struct rcu_cblist ready_cbs;
struct rcu_head *rhp;
@@ -1171,8 +1712,14 @@ static void srcu_invoke_callbacks(struct work_struct *work)
ssp = sdp->ssp;
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
+ WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ /*
+ * Although this function is theoretically re-entrant, concurrent
+	 * callback invocation is disallowed to avoid executing an SRCU barrier
+ * too early.
+ */
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
spin_unlock_irq_rcu_node(sdp);
@@ -1182,26 +1729,28 @@ static void srcu_invoke_callbacks(struct work_struct *work)
/* We are on the job! Extract and invoke ready callbacks. */
sdp->srcu_cblist_invoking = true;
rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
+ len = ready_cbs.len;
spin_unlock_irq_rcu_node(sdp);
rhp = rcu_cblist_dequeue(&ready_cbs);
for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
debug_rcu_head_unqueue(rhp);
+ debug_rcu_head_callback(rhp);
local_bh_disable();
rhp->func(rhp);
local_bh_enable();
}
+ WARN_ON_ONCE(ready_cbs.len);
/*
* Update counts, accelerate new callbacks, and if needed,
* schedule another round of callback invocation.
*/
spin_lock_irq_rcu_node(sdp);
- rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
spin_unlock_irq_rcu_node(sdp);
+	/* An SRCU barrier or callbacks from previously nested work may be pending. */
if (more)
srcu_schedule_cbs_sdp(sdp, 0);
}
@@ -1214,20 +1763,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
pushgp = false;
}
- } else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
+ } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) {
/* Outstanding request and no GP. Start one. */
srcu_gp_start(ssp);
}
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (pushgp)
- queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
+ queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
}
/*
@@ -1235,12 +1784,30 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
*/
static void process_srcu(struct work_struct *work)
{
+ unsigned long curdelay;
+ unsigned long j;
struct srcu_struct *ssp;
+ struct srcu_usage *sup;
- ssp = container_of(work, struct srcu_struct, work.work);
+ sup = container_of(work, struct srcu_usage, work.work);
+ ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
- srcu_reschedule(ssp, srcu_get_delay(ssp));
+ curdelay = srcu_get_delay(ssp);
+ if (curdelay) {
+ WRITE_ONCE(sup->reschedule_count, 0);
+ } else {
+ j = jiffies;
+ if (READ_ONCE(sup->reschedule_jiffies) == j) {
+ WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
+ if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
+ curdelay = 1;
+ } else {
+ WRITE_ONCE(sup->reschedule_count, 1);
+ WRITE_ONCE(sup->reschedule_jiffies, j);
+ }
+ }
+ srcu_reschedule(ssp, curdelay);
}
void srcutorture_get_gp_data(enum rcutorture_type test_type,
@@ -1250,47 +1817,73 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
if (test_type != SRCU_FLAVOR)
return;
*flags = 0;
- *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
+ *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+static const char * const srcu_size_state_name[] = {
+ "SRCU_SIZE_SMALL",
+ "SRCU_SIZE_ALLOC",
+ "SRCU_SIZE_WAIT_BARRIER",
+ "SRCU_SIZE_WAIT_CALL",
+ "SRCU_SIZE_WAIT_CBS1",
+ "SRCU_SIZE_WAIT_CBS2",
+ "SRCU_SIZE_WAIT_CBS3",
+ "SRCU_SIZE_WAIT_CBS4",
+ "SRCU_SIZE_BIG",
+ "SRCU_SIZE_???",
+};
+
void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
{
int cpu;
int idx;
unsigned long s0 = 0, s1 = 0;
+ int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
+ int ss_state_idx = ss_state;
idx = ssp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):",
- tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *sdp;
-
- sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(sdp->srcu_unlock_count[!idx]);
- u1 = data_race(sdp->srcu_unlock_count[idx]);
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = data_race(sdp->srcu_lock_count[!idx]);
- l1 = data_race(sdp->srcu_lock_count[idx]);
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld %c)",
- cpu, c0, c1,
- "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
- s0 += c0;
- s1 += c1;
+ if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
+ ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
+ pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
+ tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
+ srcu_size_state_name[ss_state_idx]);
+ if (!ssp->sda) {
+ // Called after cleanup_srcu_struct(), perhaps.
+ pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n");
+ } else {
+ pr_cont(" per-CPU(idx=%d):", idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *sdp;
+
+ sdp = per_cpu_ptr(ssp->sda, cpu);
+ u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
+ u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
+ l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld %c)",
+ cpu, c0, c1,
+ "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
}
- pr_cont(" T(%ld,%ld)\n", s0, s1);
+ if (SRCU_SIZING_IS_TORTURE())
+ srcu_transition_to_big(ssp);
}
EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
@@ -1299,21 +1892,44 @@ static int __init srcu_bootup_announce(void)
pr_info("Hierarchical SRCU implementation.\n");
if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+ if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
+ pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
+ if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
+ pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
+ pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
return 0;
}
early_initcall(srcu_bootup_announce);
void __init srcu_init(void)
{
- struct srcu_struct *ssp;
+ struct srcu_usage *sup;
+
+ /* Decide on srcu_struct-size strategy. */
+ if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
+ if (nr_cpu_ids >= big_cpu_lim) {
+ convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention.
+ pr_info("%s: Setting srcu_struct sizes to big.\n", __func__);
+ } else {
+ convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND;
+ pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__);
+ }
+ }
+ /*
+ * Once that is set, call_srcu() can follow the normal path and
+	 * queue delayed work. This must follow the creation of the RCU
+	 * workqueues and the initialization of timers.
+ */
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
- ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
+ sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
work.work.entry);
- check_init_srcu_struct(ssp);
- list_del_init(&ssp->work.work.entry);
- queue_work(rcu_gp_wq, &ssp->work.work);
+ list_del_init(&sup->work.work.entry);
+ if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
+ sup->srcu_size_state == SRCU_SIZE_SMALL)
+ sup->srcu_size_state = SRCU_SIZE_ALLOC;
+ queue_work(rcu_gp_wq, &sup->work.work);
}
}
@@ -1323,13 +1939,14 @@ void __init srcu_init(void)
static int srcu_module_coming(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- int ret;
for (i = 0; i < mod->num_srcu_structs; i++) {
- ret = init_srcu_struct(*(sspp++));
- if (WARN_ON_ONCE(ret))
- return ret;
+ ssp = *(sspp++);
+ ssp->sda = alloc_percpu(struct srcu_data);
+ if (WARN_ON_ONCE(!ssp->sda))
+ return -ENOMEM;
}
return 0;
}
@@ -1338,10 +1955,17 @@ static int srcu_module_coming(struct module *mod)
static void srcu_module_going(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- for (i = 0; i < mod->num_srcu_structs; i++)
- cleanup_srcu_struct(*(sspp++));
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ssp = *(sspp++);
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
+ !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
+ cleanup_srcu_struct(ssp);
+ if (!WARN_ON(srcu_readers_active(ssp)))
+ free_percpu(ssp->sda);
+ }
}
/* Handle one module, either coming or going. */
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index d4558ab7a07d..e550f97779b8 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -44,7 +44,7 @@ static void rcu_sync_func(struct rcu_head *rhp);
static void rcu_sync_call(struct rcu_sync *rsp)
{
- call_rcu(&rsp->cb_head, rcu_sync_func);
+ call_rcu_hurry(&rsp->cb_head, rcu_sync_func);
}
/**
@@ -94,9 +94,9 @@ static void rcu_sync_func(struct rcu_head *rhp)
rcu_sync_call(rsp);
} else {
/*
- * We're at least a GP after the last rcu_sync_exit(); eveybody
+ * We're at least a GP after the last rcu_sync_exit(); everybody
* will now have observed the write side critical section.
- * Let 'em rip!.
+ * Let 'em rip!
*/
WRITE_ONCE(rsp->gp_state, GP_IDLE);
}
@@ -111,7 +111,7 @@ static void rcu_sync_func(struct rcu_head *rhp)
* a slowpath during the update. After this function returns, all
* subsequent calls to rcu_sync_is_idle() will return false, which
* tells readers to stay off their fastpaths. A later call to
- * rcu_sync_exit() re-enables reader slowpaths.
+ * rcu_sync_exit() re-enables reader fastpaths.
*
* When called in isolation, rcu_sync_enter() must wait for a grace
* period, however, closely spaced calls to rcu_sync_enter() can
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 36607551f966..732ad5b39946 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -6,6 +6,7 @@
*/
#ifdef CONFIG_TASKS_RCU_GENERIC
+#include "rcu_segcblist.h"
////////////////////////////////////////////////////////////////////////
//
@@ -13,51 +14,90 @@
struct rcu_tasks;
typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp);
-typedef void (*pregp_func_t)(void);
+typedef void (*pregp_func_t)(struct list_head *hop);
typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop);
typedef void (*postscan_func_t)(struct list_head *hop);
typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp);
typedef void (*postgp_func_t)(struct rcu_tasks *rtp);
/**
- * Definition for a Tasks-RCU-like mechanism.
- * @cbs_head: Head of callback list.
- * @cbs_tail: Tail pointer for callback list.
- * @cbs_wq: Wait queue allowning new callback to get kthread's attention.
- * @cbs_lock: Lock protecting callback list.
- * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism.
+ * @cblist: Callback list.
+ * @lock: Lock protecting per-CPU callback list.
+ * @rtp_jiffies: Jiffies counter value for statistics.
+ * @lazy_timer: Timer to unlazify callbacks.
+ * @urgent_gp: Number of additional non-lazy grace periods.
+ * @rtp_n_lock_retries: Rough lock-contention statistic.
+ * @rtp_work: Work queue for invoking callbacks.
+ * @rtp_irq_work: IRQ work queue for deferred wakeups.
+ * @barrier_q_head: RCU callback for barrier operation.
+ * @rtp_blkd_tasks: List of tasks blocked as readers.
+ * @cpu: CPU number corresponding to this entry.
+ * @rtpp: Pointer to the rcu_tasks structure.
+ */
+struct rcu_tasks_percpu {
+ struct rcu_segcblist cblist;
+ raw_spinlock_t __private lock;
+ unsigned long rtp_jiffies;
+ unsigned long rtp_n_lock_retries;
+ struct timer_list lazy_timer;
+ unsigned int urgent_gp;
+ struct work_struct rtp_work;
+ struct irq_work rtp_irq_work;
+ struct rcu_head barrier_q_head;
+ struct list_head rtp_blkd_tasks;
+ int cpu;
+ struct rcu_tasks *rtpp;
+};
+
+/**
+ * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
+ * @cbs_wait: RCU wait allowing a new callback to get kthread's attention.
+ * @cbs_gbl_lock: Lock protecting callback list.
+ * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
* @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping.
* @init_fract: Initial backoff sleep interval.
* @gp_jiffies: Time of last @gp_state transition.
* @gp_start: Most recent grace-period start in jiffies.
- * @n_gps: Number of grace periods completed since boot.
+ * @tasks_gp_seq: Number of grace periods completed since boot.
* @n_ipis: Number of IPIs sent to encourage grace periods to end.
* @n_ipis_fails: Number of IPI-send failures.
+ * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy.
* @pregp_func: This flavor's pre-grace-period function (optional).
* @pertask_func: This flavor's per-task scan function (optional).
* @postscan_func: This flavor's post-task scan function (optional).
- * @holdout_func: This flavor's holdout-list scan function (optional).
+ * @holdouts_func: This flavor's holdout-list scan function (optional).
* @postgp_func: This flavor's post-grace-period function (optional).
* @call_func: This flavor's call_rcu()-equivalent function.
+ * @rtpcpu: This flavor's rcu_tasks_percpu structure.
+ * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
+ * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
+ * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing.
+ * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers.
+ * @barrier_q_mutex: Serialize barrier operations.
+ * @barrier_q_count: Number of queues being waited on.
+ * @barrier_q_completion: Barrier wait/wakeup mechanism.
+ * @barrier_q_seq: Sequence number for barrier operations.
* @name: This flavor's textual name.
* @kname: This flavor's kthread name.
*/
struct rcu_tasks {
- struct rcu_head *cbs_head;
- struct rcu_head **cbs_tail;
- struct wait_queue_head cbs_wq;
- raw_spinlock_t cbs_lock;
+ struct rcuwait cbs_wait;
+ raw_spinlock_t cbs_gbl_lock;
+ struct mutex tasks_gp_mutex;
int gp_state;
int gp_sleep;
int init_fract;
unsigned long gp_jiffies;
unsigned long gp_start;
- unsigned long n_gps;
+ unsigned long tasks_gp_seq;
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
+ unsigned long lazy_jiffies;
rcu_tasks_gp_func_t gp_func;
pregp_func_t pregp_func;
pertask_func_t pertask_func;
@@ -65,34 +105,79 @@ struct rcu_tasks {
holdouts_func_t holdouts_func;
postgp_func_t postgp_func;
call_rcu_func_t call_func;
+ struct rcu_tasks_percpu __percpu *rtpcpu;
+ int percpu_enqueue_shift;
+ int percpu_enqueue_lim;
+ int percpu_dequeue_lim;
+ unsigned long percpu_dequeue_gpseq;
+ struct mutex barrier_q_mutex;
+ atomic_t barrier_q_count;
+ struct completion barrier_q_completion;
+ unsigned long barrier_q_seq;
char *name;
char *kname;
};
-#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
-static struct rcu_tasks rt_name = \
-{ \
- .cbs_tail = &rt_name.cbs_head, \
- .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \
- .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \
- .gp_func = gp, \
- .call_func = call, \
- .name = n, \
- .kname = #rt_name, \
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp);
+
+#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \
+static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \
+ .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \
+}; \
+static struct rcu_tasks rt_name = \
+{ \
+ .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \
+ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \
+ .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \
+ .gp_func = gp, \
+ .call_func = call, \
+ .rtpcpu = &rt_name ## __percpu, \
+ .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \
+ .name = n, \
+ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \
+ .percpu_enqueue_lim = 1, \
+ .percpu_dequeue_lim = 1, \
+ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \
+ .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \
+ .kname = #rt_name, \
}
+#ifdef CONFIG_TASKS_RCU
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
+/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
+static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
+#endif
+
/* Avoid IPIing CPUs early in the grace period. */
#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY;
module_param(rcu_task_ipi_delay, int, 0644);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
+#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30)
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
+#define RCU_TASK_STALL_INFO (HZ * 10)
+static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO;
+module_param(rcu_task_stall_info, int, 0644);
+static int rcu_task_stall_info_mult __read_mostly = 3;
+module_param(rcu_task_stall_info_mult, int, 0444);
+
+static int rcu_task_enqueue_lim __read_mostly = -1;
+module_param(rcu_task_enqueue_lim, int, 0444);
+
+static bool rcu_task_cb_adjust;
+static int rcu_task_contend_lim __read_mostly = 100;
+module_param(rcu_task_contend_lim, int, 0444);
+static int rcu_task_collapse_lim __read_mostly = 10;
+module_param(rcu_task_collapse_lim, int, 0444);
+static int rcu_task_lazy_lim __read_mostly = 32;
+module_param(rcu_task_lazy_lim, int, 0444);
/* RCU tasks grace-period state for debugging. */
#define RTGS_INIT 0
@@ -128,6 +213,8 @@ static const char * const rcu_tasks_gp_state_names[] = {
//
// Generic code.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp);
+
/* Record grace-period phase and time. */
static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
{
@@ -148,47 +235,383 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
}
#endif /* #ifndef CONFIG_TINY_RCU */
+// Initialize per-CPU callback lists for the specified flavor of
+// Tasks RCU. Do not enqueue callbacks before this function is invoked.
+static void cblist_init_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ int lim;
+ int shift;
+
+ if (rcu_task_enqueue_lim < 0) {
+ rcu_task_enqueue_lim = 1;
+ rcu_task_cb_adjust = true;
+ } else if (rcu_task_enqueue_lim == 0) {
+ rcu_task_enqueue_lim = 1;
+ }
+ lim = rcu_task_enqueue_lim;
+
+ if (lim > nr_cpu_ids)
+ lim = nr_cpu_ids;
+ shift = ilog2(nr_cpu_ids / lim);
+ if (((nr_cpu_ids - 1) >> shift) >= lim)
+ shift++;
+ WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
+ smp_store_release(&rtp->percpu_enqueue_lim, lim);
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(!rtpcp);
+ if (cpu)
+ raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock));
+ local_irq_save(flags); // serialize initialization
+ if (rcu_segcblist_empty(&rtpcp->cblist))
+ rcu_segcblist_init(&rtpcp->cblist);
+ local_irq_restore(flags);
+ INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq);
+ rtpcp->cpu = cpu;
+ rtpcp->rtpp = rtp;
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ }
+
+ pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name,
+ data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust);
+}
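// Worked example (hypothetical values, not part of this patch): with
// nr_cpu_ids == 16 (all CPUs possible) and rcu_task_enqueue_lim == 4,
// lim is 4 and shift is ilog2(16 / 4) == 2; the check (16 - 1) >> 2 == 3,
// which is less than 4, so the shift is not bumped.  In
// call_rcu_tasks_generic() below, ideal_cpu = smp_processor_id() >> 2,
// so CPUs 0-3, 4-7, 8-11, and 12-15 enqueue onto the rcu_tasks_percpu
// structures of CPUs 0, 1, 2, and 3, respectively.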
+
+// Compute wakeup time for lazy callback timer.
+static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp)
+{
+ return jiffies + rtp->lazy_jiffies;
+}
+
+// Timer handler that unlazifies lazy callbacks.
+static void call_rcu_tasks_generic_timer(struct timer_list *tlp)
+{
+ unsigned long flags;
+ bool needwake = false;
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer);
+
+ rtp = rtpcp->rtpp;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) {
+ if (!rtpcp->urgent_gp)
+ rtpcp->urgent_gp = 1;
+ needwake = true;
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (needwake)
+ rcuwait_wake_up(&rtp->cbs_wait);
+}
+
+// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic().
+static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work);
+
+ rtp = rtpcp->rtpp;
+ rcuwait_wake_up(&rtp->cbs_wait);
+}
+
// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
struct rcu_tasks *rtp)
{
+ int chosen_cpu;
unsigned long flags;
+ bool havekthread = smp_load_acquire(&rtp->kthread_ptr);
+ int ideal_cpu;
+ unsigned long j;
+ bool needadjust = false;
bool needwake;
+ struct rcu_tasks_percpu *rtpcp;
rhp->next = NULL;
rhp->func = func;
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- needwake = !rtp->cbs_head;
- WRITE_ONCE(*rtp->cbs_tail, rhp);
- rtp->cbs_tail = &rhp->next;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
+ local_irq_save(flags);
+ rcu_read_lock();
+ ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
+ chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask);
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu);
+ if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled.
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled.
+ j = jiffies;
+ if (rtpcp->rtp_jiffies != j) {
+ rtpcp->rtp_jiffies = j;
+ rtpcp->rtp_n_lock_retries = 0;
+ }
+ if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim &&
+ READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids)
+ needadjust = true; // Defer adjustment to avoid deadlock.
+ }
+ // Queuing callbacks before initialization not yet supported.
+ if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist)))
+ rcu_segcblist_init(&rtpcp->cblist);
+ needwake = (func == wakeme_after_rcu) ||
+ (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim);
+ if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) {
+ if (rtp->lazy_jiffies)
+ mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp));
+ else
+ needwake = rcu_segcblist_empty(&rtpcp->cblist);
+ }
+ if (needwake)
+ rtpcp->urgent_gp = 3;
+ rcu_segcblist_enqueue(&rtpcp->cblist, rhp);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ if (unlikely(needadjust)) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
+ WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
+ smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
+ pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ rcu_read_unlock();
/* We can't create the thread unless interrupts are enabled. */
if (needwake && READ_ONCE(rtp->kthread_ptr))
- wake_up(&rtp->cbs_wq);
+ irq_work_queue(&rtpcp->rtp_irq_work);
}
-// Wait for a grace period for the specified flavor of Tasks RCU.
-static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+// RCU callback function for rcu_barrier_tasks_generic().
+static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp)
{
- /* Complain if the scheduler has not started. */
- RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
- "synchronize_rcu_tasks called too soon");
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp;
- /* Wait for the grace period. */
- wait_rcu_gp(rtp->call_func);
+ rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head);
+ rtp = rtpcp->rtpp;
+ if (atomic_dec_and_test(&rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
}
-/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
-static int __noreturn rcu_tasks_kthread(void *arg)
+// Wait for all in-flight callbacks for the specified RCU Tasks flavor.
+// Operates in a manner similar to rcu_barrier().
+static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq);
+
+ mutex_lock(&rtp->barrier_q_mutex);
+ if (rcu_seq_done(&rtp->barrier_q_seq, s)) {
+ smp_mb();
+ mutex_unlock(&rtp->barrier_q_mutex);
+ return;
+ }
+ rcu_seq_start(&rtp->barrier_q_seq);
+ init_completion(&rtp->barrier_q_completion);
+ atomic_set(&rtp->barrier_q_count, 2);
+ for_each_possible_cpu(cpu) {
+ if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim))
+ break;
+ rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+ rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head))
+ atomic_inc(&rtp->barrier_q_count);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ if (atomic_sub_and_test(2, &rtp->barrier_q_count))
+ complete(&rtp->barrier_q_completion);
+ wait_for_completion(&rtp->barrier_q_completion);
+ rcu_seq_end(&rtp->barrier_q_seq);
+ mutex_unlock(&rtp->barrier_q_mutex);
+}
+
+// Advance callbacks and indicate whether either a grace period or
+// callback invocation is needed.
+static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
{
+ int cpu;
+ int dequeue_limit;
unsigned long flags;
- struct rcu_head *list;
- struct rcu_head *next;
+ bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
+ long n;
+ long ncbs = 0;
+ long ncbsnz = 0;
+ int needgpcb = 0;
+
+ dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
+ for (cpu = 0; cpu < dequeue_limit; cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ /* Advance and accelerate any new callbacks. */
+ if (!rcu_segcblist_n_cbs(&rtpcp->cblist))
+ continue;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ // Should we shrink down to a single callback queue?
+ n = rcu_segcblist_n_cbs(&rtpcp->cblist);
+ if (n) {
+ ncbs += n;
+ if (cpu > 0)
+ ncbsnz += n;
+ }
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) {
+ if (rtp->lazy_jiffies)
+ rtpcp->urgent_gp--;
+ needgpcb |= 0x3;
+ } else if (rcu_segcblist_empty(&rtpcp->cblist)) {
+ rtpcp->urgent_gp = 0;
+ }
+ if (rcu_segcblist_ready_cbs(&rtpcp->cblist))
+ needgpcb |= 0x1;
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+
+ // Shrink down to a single callback queue if appropriate.
+ // This is done in two stages: (1) If there are no more than
+ // rcu_task_collapse_lim callbacks on CPU 0 and none on any other
+ // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period,
+ // if there has not been an increase in callbacks, limit dequeuing
+ // to CPU 0. Note the matching RCU read-side critical section in
+ // call_rcu_tasks_generic().
+ if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim > 1) {
+ WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
+ smp_store_release(&rtp->percpu_enqueue_lim, 1);
+ rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
+ gpdone = false;
+ pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+ if (rcu_task_cb_adjust && !ncbsnz && gpdone) {
+ raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
+ if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {
+ WRITE_ONCE(rtp->percpu_dequeue_lim, 1);
+ pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);
+ }
+ if (rtp->percpu_dequeue_lim == 1) {
+ for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist));
+ }
+ }
+ raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);
+ }
+
+ return needgpcb;
+}
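For orientation, the enqueue side honors this collapse through percpu_enqueue_shift. A hedged sketch of the queue-selection arithmetic used by call_rcu_tasks_generic(); the helper name is illustrative, and the real caller runs with interrupts disabled:

/* Illustrative helper: which per-CPU queue does the current CPU enqueue on? */
static int rcu_tasks_pick_queue(struct rcu_tasks *rtp)
{
	/*
	 * A shift of 0 spreads callbacks across per-CPU queues, while a
	 * shift of order_base_2(nr_cpu_ids), as written above, maps every
	 * CPU onto queue 0.
	 */
	return raw_smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift);
}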
+
+// Advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp)
+{
+ int cpu;
+ int cpunext;
+ int cpuwq;
+ unsigned long flags;
+ int len;
+ struct rcu_head *rhp;
+ struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
+ struct rcu_tasks_percpu *rtpcp_next;
+
+ cpu = rtpcp->cpu;
+ cpunext = cpu * 2 + 1;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ cpunext++;
+ if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) {
+ rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext);
+ cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND;
+ queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work);
+ }
+ }
+
+ if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu))
+ return;
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+ rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ len = rcl.len;
+ for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) {
+ debug_rcu_head_callback(rhp);
+ local_bh_disable();
+ rhp->func(rhp);
+ local_bh_enable();
+ cond_resched();
+ }
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ rcu_segcblist_add_len(&rtpcp->cblist, -len);
+ (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
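The cpunext = cpu * 2 + 1 arithmetic above turns the per-CPU workqueue kicks into an implicit binary tree rooted at CPU 0, so callback invocation fans out in O(log N) waves rather than CPU 0 walking every queue itself. For example, with eight queues within percpu_dequeue_lim, queue 0 kicks queues 1 and 2, queue 1 kicks 3 and 4, queue 2 kicks 5 and 6, and queue 3 kicks 7.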
+
+// Workqueue flood to advance callbacks and invoke any that are ready.
+static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp)
+{
+ struct rcu_tasks *rtp;
+ struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work);
+
+ rtp = rtpcp->rtpp;
+ rcu_tasks_invoke_cbs(rtp, rtpcp);
+}
+
+// Wait for one grace period.
+static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
+{
+ int needgpcb;
+
+ mutex_lock(&rtp->tasks_gp_mutex);
+
+	// Mid-boot calls force a grace period, given that there is no
+	// grace-period kthread yet. Otherwise, wait for callbacks that
+	// need either a grace period or invocation.
+ if (unlikely(midboot)) {
+ needgpcb = 0x2;
+ } else {
+ mutex_unlock(&rtp->tasks_gp_mutex);
+ set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+ rcuwait_wait_event(&rtp->cbs_wait,
+ (needgpcb = rcu_tasks_need_gpcb(rtp)),
+ TASK_IDLE);
+ mutex_lock(&rtp->tasks_gp_mutex);
+ }
+
+ if (needgpcb & 0x2) {
+ // Wait for one grace period.
+ set_tasks_gp_state(rtp, RTGS_WAIT_GP);
+ rtp->gp_start = jiffies;
+ rcu_seq_start(&rtp->tasks_gp_seq);
+ rtp->gp_func(rtp);
+ rcu_seq_end(&rtp->tasks_gp_seq);
+ }
+
+ // Invoke callbacks.
+ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
+ rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0));
+ mutex_unlock(&rtp->tasks_gp_mutex);
+}
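The needgpcb value passed between rcu_tasks_need_gpcb() and rcu_tasks_one_gp() is a two-bit mask. A sketch of the encoding implied by the code above (the enum is purely illustrative; the code itself uses bare 0x1/0x2/0x3):

enum {
	RCU_TASKS_NEED_CB_INVOKE = 0x1,	/* some callbacks are ready to invoke */
	RCU_TASKS_NEED_GP        = 0x2,	/* pending callbacks need a new grace period */
};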
+
+// RCU-tasks kthread that detects grace periods and invokes callbacks.
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+ int cpu;
struct rcu_tasks *rtp = arg;
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0);
+ rtpcp->urgent_gp = 1;
+ }
+
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current, HK_FLAG_RCU);
- WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
+ housekeeping_affine(current, HK_TYPE_RCU);
+ smp_store_release(&rtp->kthread_ptr, current); // Let GPs start!
/*
* Each pass through the following loop makes one check for
@@ -197,48 +620,29 @@ static int __noreturn rcu_tasks_kthread(void *arg)
* This loop is terminated by the system going down. ;-)
*/
for (;;) {
+ // Wait for one grace period and invoke any callbacks
+ // that are ready.
+ rcu_tasks_one_gp(rtp, false);
- /* Pick up any new callbacks. */
- raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
- smp_mb__after_spinlock(); // Order updates vs. GP.
- list = rtp->cbs_head;
- rtp->cbs_head = NULL;
- rtp->cbs_tail = &rtp->cbs_head;
- raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
-
- /* If there were none, wait a bit and start over. */
- if (!list) {
- wait_event_interruptible(rtp->cbs_wq,
- READ_ONCE(rtp->cbs_head));
- if (!rtp->cbs_head) {
- WARN_ON(signal_pending(current));
- set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
- schedule_timeout_idle(HZ/10);
- }
- continue;
- }
-
- // Wait for one grace period.
- set_tasks_gp_state(rtp, RTGS_WAIT_GP);
- rtp->gp_start = jiffies;
- rtp->gp_func(rtp);
- rtp->n_gps++;
-
- /* Invoke the callbacks. */
- set_tasks_gp_state(rtp, RTGS_INVOKE_CBS);
- while (list) {
- next = list->next;
- local_bh_disable();
- list->func(list);
- local_bh_enable();
- list = next;
- cond_resched();
- }
- /* Paranoid sleep to keep this from entering a tight loop */
+ // Paranoid sleep to keep this from entering a tight loop.
schedule_timeout_idle(rtp->gp_sleep);
+ }
+}
- set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
+// Wait for a grace period for the specified flavor of Tasks RCU.
+static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
+{
+ /* Complain if the scheduler has not started. */
+ if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+ "synchronize_%s() called too soon", rtp->name))
+ return;
+
+ // If the grace-period kthread is running, use it.
+ if (READ_ONCE(rtp->kthread_ptr)) {
+ wait_rcu_gp(rtp->call_func);
+ return;
}
+ rcu_tasks_one_gp(rtp, true);
}
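The synchronous wrappers built on synchronize_rcu_tasks_generic() are typically used before freeing code that tasks may still be executing, for example tracing trampolines. A hypothetical teardown sketch; every helper and structure name here is illustrative, not a kernel API:

static void release_trampoline(struct my_tramp *tp)
{
	unpatch_call_site(tp);		/* no new tasks can enter the trampoline */
	synchronize_rcu_tasks();	/* tasks already inside have since passed a
					 * voluntary switch, idle, or usermode */
	free_exec_mem(tp->image);	/* now safe to free the executable memory */
}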
/* Spawn RCU-tasks grace-period kthread. */
@@ -260,8 +664,15 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
static void __init rcu_tasks_bootup_oddness(void)
{
#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU)
+ int rtsimc;
+
if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+ rtsimc = clamp(rcu_task_stall_info_mult, 1, 10);
+ if (rtsimc != rcu_task_stall_info_mult) {
+ pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc);
+ rcu_task_stall_info_mult = rtsimc;
+ }
#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */
#ifdef CONFIG_TASKS_RCU
pr_info("\tTrampoline variant of Tasks RCU enabled.\n");
@@ -280,14 +691,34 @@ static void __init rcu_tasks_bootup_oddness(void)
/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
{
- pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n",
+ int cpu;
+ bool havecbs = false;
+ bool haveurgent = false;
+ bool haveurgentcbs = false;
+
+ for_each_possible_cpu(cpu) {
+ struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
+
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)))
+ havecbs = true;
+ if (data_race(rtpcp->urgent_gp))
+ haveurgent = true;
+ if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp))
+ haveurgentcbs = true;
+ if (havecbs && haveurgent && haveurgentcbs)
+ break;
+ }
+ pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n",
rtp->kname,
tasks_gp_state_getname(rtp), data_race(rtp->gp_state),
jiffies - data_race(rtp->gp_jiffies),
- data_race(rtp->n_gps),
+ data_race(rcu_seq_current(&rtp->tasks_gp_seq)),
data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis),
".k"[!!data_race(rtp->kthread_ptr)],
- ".C"[!!data_race(rtp->cbs_head)],
+ ".C"[havecbs],
+ ".u"[haveurgent],
+ ".U"[haveurgentcbs],
+ rtp->lazy_jiffies,
s);
}
#endif // #ifndef CONFIG_TINY_RCU
@@ -303,13 +734,18 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t);
/* Wait for one RCU-tasks grace period. */
static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
{
- struct task_struct *g, *t;
- unsigned long lastreport;
- LIST_HEAD(holdouts);
+ struct task_struct *g;
int fract;
+ LIST_HEAD(holdouts);
+ unsigned long j;
+ unsigned long lastinfo;
+ unsigned long lastreport;
+ bool reported = false;
+ int rtsi;
+ struct task_struct *t;
set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP);
- rtp->pregp_func();
+ rtp->pregp_func(&holdouts);
/*
* There were callbacks, so we need to wait for an RCU-tasks
@@ -318,10 +754,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
* and make a list of them in holdouts.
*/
set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST);
- rcu_read_lock();
- for_each_process_thread(g, t)
- rtp->pertask_func(t, &holdouts);
- rcu_read_unlock();
+ if (rtp->pertask_func) {
+ rcu_read_lock();
+ for_each_process_thread(g, t)
+ rtp->pertask_func(t, &holdouts);
+ rcu_read_unlock();
+ }
set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST);
rtp->postscan_func(&holdouts);
@@ -332,30 +770,50 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
* is empty, we are done.
*/
lastreport = jiffies;
+ lastinfo = lastreport;
+ rtsi = READ_ONCE(rcu_task_stall_info);
// Start off with initial wait and slowly back off to 1 HZ wait.
fract = rtp->init_fract;
while (!list_empty(&holdouts)) {
+ ktime_t exp;
bool firstreport;
bool needreport;
int rtst;
- /* Slowly back off waiting for holdouts */
+ // Slowly back off waiting for holdouts
set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
- schedule_timeout_idle(fract);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ schedule_timeout_idle(fract);
+ } else {
+ exp = jiffies_to_nsecs(fract);
+ __set_current_state(TASK_IDLE);
+ schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD);
+ }
if (fract < HZ)
fract++;
rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 && time_after(jiffies, lastreport + rtst);
- if (needreport)
+ if (needreport) {
lastreport = jiffies;
+ reported = true;
+ }
firstreport = true;
WARN_ON(signal_pending(current));
set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS);
rtp->holdouts_func(&holdouts, needreport, &firstreport);
+
+ // Print pre-stall informational messages if needed.
+ j = jiffies;
+ if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) {
+ lastinfo = j;
+ rtsi = rtsi * rcu_task_stall_info_mult;
+ pr_info("%s: %s grace period number %lu (since boot) is %lu jiffies old.\n",
+ __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start);
+ }
}
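To make the back-off concrete: assuming, purely for illustration, a 10-second rcu_task_stall_info interval and an rcu_task_stall_info_mult of 3, the pr_info() lines above would appear roughly 10, 40, and 130 seconds into an overly long grace period, each gap three times the previous one, and they stop entirely once a full stall warning has been printed (reported is then true).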
set_tasks_gp_state(rtp, RTGS_POST_GP);
@@ -369,7 +827,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
////////////////////////////////////////////////////////////////////////
//
// Simple variant of RCU whose quiescent states are voluntary context
-// switch, cond_resched_rcu_qs(), user-space execution, and idle.
+// switch, cond_resched_tasks_rcu_qs(), user-space execution, and idle.
// As such, grace periods can take one good long time. There are no
// read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
// because this implementation is intended to get the system into a safe
@@ -377,9 +835,49 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// Finally, this implementation does not support high call_rcu_tasks()
// rates from multiple CPUs. If this is required, per-CPU callback lists
// will be needed.
+//
+// The implementation uses rcu_tasks_wait_gp(), which relies on function
+// pointers in the rcu_tasks structure. The rcu_spawn_tasks_kthread()
+// function sets these function pointers up so that rcu_tasks_wait_gp()
+// invokes these functions in this order:
+//
+// rcu_tasks_pregp_step():
+// Invokes synchronize_rcu() in order to wait for all in-flight
+// t->on_rq and t->nvcsw transitions to complete. This works because
+// all such transitions are carried out with interrupts disabled.
+// rcu_tasks_pertask(), invoked on every non-idle task:
+// For every runnable non-idle task other than the current one, use
+// get_task_struct() to pin down that task, snapshot that task's
+// number of voluntary context switches, and add that task to the
+// holdout list.
+// rcu_tasks_postscan():
+// Invoke synchronize_srcu() to ensure that all tasks that were
+// in the process of exiting (and which thus might not know to
+// synchronize with this RCU Tasks grace period) have completed
+// exiting.
+// check_all_holdout_tasks(), repeatedly until holdout list is empty:
+// Scans the holdout list, attempting to identify a quiescent state
+// for each task on the list. If there is a quiescent state, the
+// corresponding task is removed from the holdout list.
+// rcu_tasks_postgp():
+// Invokes synchronize_rcu() in order to ensure that all prior
+// t->on_rq and t->nvcsw transitions are seen by all CPUs and tasks
+// to have happened before the end of this RCU Tasks grace period.
+// Again, this works because all such transitions are carried out
+// with interrupts disabled.
+//
+// For each exiting task, the exit_tasks_rcu_start() and
+// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU
+// read-side critical sections waited for by rcu_tasks_postscan().
+//
+// Pre-grace-period update-side code is ordered before the grace period
+// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code
+// is ordered before the grace period via synchronize_rcu() call in
+// rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// disabling.
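Condensed into straight-line pseudocode, one grace period driven through rcu_tasks_wait_gp() with the hooks listed above looks roughly like this (a reading aid only, not additional code):

rcu_tasks_pregp_step(&holdouts);		/* synchronize_rcu() */
for_each_process_thread(g, t)
	rcu_tasks_pertask(t, &holdouts);	/* snapshot ->nvcsw, build holdout list */
rcu_tasks_postscan(&holdouts);			/* synchronize_srcu(&tasks_rcu_exit_srcu) */
while (!list_empty(&holdouts))
	check_all_holdout_tasks(&holdouts, needreport, &firstreport);
rcu_tasks_postgp(rtp);				/* final synchronize_rcu() */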
/* Pre-grace-period preparation. */
-static void rcu_tasks_pregp_step(void)
+static void rcu_tasks_pregp_step(struct list_head *hop)
{
/*
* Wait for all pre-existing t->on_rq and t->nvcsw transitions
@@ -397,10 +895,36 @@ static void rcu_tasks_pregp_step(void)
synchronize_rcu();
}
+/* Check for quiescent states since the pregp's synchronize_rcu() */
+static bool rcu_tasks_is_holdout(struct task_struct *t)
+{
+ int cpu;
+
+ /* Has the task been seen voluntarily sleeping? */
+ if (!READ_ONCE(t->on_rq))
+ return false;
+
+ /*
+ * Idle tasks (or idle injection) within the idle loop are RCU-tasks
+ * quiescent states. But CPU boot code performed by the idle task
+ * isn't a quiescent state.
+ */
+ if (is_idle_task(t))
+ return false;
+
+ cpu = task_cpu(t);
+
+ /* Idle tasks on offline CPUs are RCU-tasks quiescent states. */
+ if (t == idle_task(cpu) && !rcu_cpu_online(cpu))
+ return false;
+
+ return true;
+}
+
/* Per-task initial processing. */
static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
{
- if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) {
+ if (t != current && rcu_tasks_is_holdout(t)) {
get_task_struct(t);
t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
WRITE_ONCE(t->rcu_tasks_holdout, true);
@@ -411,14 +935,34 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
/* Processing between scanning taskslist and draining the holdout list. */
static void rcu_tasks_postscan(struct list_head *hop)
{
+ int rtsi = READ_ONCE(rcu_task_stall_info);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU)) {
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+ }
+
/*
- * Wait for tasks that are in the process of exiting. This
- * does only part of the job, ensuring that all tasks that were
- * previously exiting reach the point where they have disabled
- * preemption, allowing the later synchronize_rcu() to finish
- * the job.
+ * Exiting tasks may escape the tasklist scan. Those are vulnerable
+ * until their final schedule() with TASK_DEAD state. To cope with
+	 * this, divide the fragile part of the exit path into two
+	 * intersecting read-side critical sections:
+ *
+ * 1) An _SRCU_ read side starting before calling exit_notify(),
+ * which may remove the task from the tasklist, and ending after
+ * the final preempt_disable() call in do_exit().
+ *
+ * 2) An _RCU_ read side starting with the final preempt_disable()
+ * call in do_exit() and ending with the final call to schedule()
+ * with TASK_DEAD state.
+ *
+	 * This handles part 1). Part 2) is handled by rcu_tasks_postgp()'s
+	 * call to synchronize_rcu().
*/
synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU))
+ del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
}
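Restating the comment above as a rough timeline of an exiting task (exact do_exit() call sites are elided):

/*
 *   exit_tasks_rcu_start()        srcu_read_lock(&tasks_rcu_exit_srcu)
 *   exit_notify()                 task may now vanish from the tasklist scan
 *   ...
 *   final preempt_disable()       SRCU leg ends; plain-RCU leg begins
 *   schedule() with TASK_DEAD     plain-RCU leg ends, waited on by
 *                                 rcu_tasks_postgp()'s synchronize_rcu()
 */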
/* See if tasks are still holding out, complain if so. */
@@ -429,9 +973,9 @@ static void check_holdout_task(struct task_struct *t,
if (!READ_ONCE(t->rcu_tasks_holdout) ||
t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
- !READ_ONCE(t->on_rq) ||
+ !rcu_tasks_is_holdout(t) ||
(IS_ENABLED(CONFIG_NO_HZ_FULL) &&
- !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
+ !is_idle_task(t) && READ_ONCE(t->rcu_tasks_idle_cpu) >= 0)) {
WRITE_ONCE(t->rcu_tasks_holdout, false);
list_del_init(&t->rcu_tasks_holdout_list);
put_task_struct(t);
@@ -449,7 +993,7 @@ static void check_holdout_task(struct task_struct *t,
t, ".I"[is_idle_task(t)],
"N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
- t->rcu_tasks_idle_cpu, cpu);
+ data_race(t->rcu_tasks_idle_cpu), cpu);
sched_show_task(t);
}
@@ -483,7 +1027,10 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
*
* In addition, this synchronize_rcu() waits for exiting tasks
* to complete their final preempt_disable() region of execution,
- * cleaning up after the synchronize_srcu() above.
+ * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu),
+	 * ensuring that the whole region from tasklist removal until the
+	 * final schedule() with TASK_DEAD state is treated as an RCU Tasks
+	 * read-side critical section.
*/
synchronize_rcu();
}
@@ -491,6 +1038,21 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+{
+#ifndef CONFIG_TINY_RCU
+ int rtsi;
+
+ rtsi = READ_ONCE(rcu_task_stall_info);
+ pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n",
+ __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq,
+ tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies);
+ pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n");
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+#endif // #ifndef CONFIG_TINY_RCU
+}
+
/**
 * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period
* @rhp: structure to be used for queueing the RCU updates.
@@ -500,11 +1062,11 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_tasks() assumes
* that the read-side critical sections end at a voluntary context
- * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
+ * switch (not a preemption!), cond_resched_tasks_rcu_qs(), entry into idle,
* or transition to usermode execution. As such, there are no read-side
* primitives analogous to rcu_read_lock() and rcu_read_unlock() because
* this primitive is intended to determine that all tasks have passed
- * through a safe state, not so much for data-strcuture synchronization.
+ * through a safe state, not so much for data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -547,15 +1109,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
*/
void rcu_barrier_tasks(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks();
+ rcu_barrier_tasks_generic(&rcu_tasks);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
+static int rcu_tasks_lazy_ms = -1;
+module_param(rcu_tasks_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_kthread(void)
{
+ cblist_init_generic(&rcu_tasks);
rcu_tasks.gp_sleep = HZ / 10;
rcu_tasks.init_fract = HZ / 10;
+ if (rcu_tasks_lazy_ms >= 0)
+ rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms);
rcu_tasks.pregp_func = rcu_tasks_pregp_step;
rcu_tasks.pertask_func = rcu_tasks_pertask;
rcu_tasks.postscan_func = rcu_tasks_postscan;
@@ -573,27 +1140,48 @@ void show_rcu_tasks_classic_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
-/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+struct task_struct *get_rcu_tasks_gp_kthread(void)
+{
+ return rcu_tasks.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread);
+
+/*
+ * Help protect against a tasklist-scan blind spot while the task
+ * is exiting and may be removed from the tasklist. See the
+ * corresponding synchronize_srcu() for further details.
+ */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
{
- preempt_disable();
current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
- preempt_enable();
}
-/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
-void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
+/*
+ * Help protect against a tasklist-scan blind spot while the task
+ * is exiting and may be removed from the tasklist. See the
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)
{
struct task_struct *t = current;
- preempt_disable();
__srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx);
- preempt_enable();
- exit_tasks_rcu_finish_trace(t);
+}
+
+/*
+ * Help protect against a tasklist-scan blind spot while the task
+ * is exiting and may be removed from the tasklist. See the
+ * corresponding synchronize_srcu() for further details.
+ */
+void exit_tasks_rcu_finish(void)
+{
+ exit_tasks_rcu_stop();
+ exit_tasks_rcu_finish_trace(current);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
+void exit_tasks_rcu_stop(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -603,10 +1191,15 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
//
// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of
// passing an empty function to schedule_on_each_cpu(). This approach
-// provides an asynchronous call_rcu_tasks_rude() API and batching
-// of concurrent calls to the synchronous synchronize_rcu_rude() API.
-// This sends IPIs far and wide and induces otherwise unnecessary context
-// switches on all online CPUs, whether idle or not.
+// provides an asynchronous call_rcu_tasks_rude() API and batching of
+// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
+// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
+// and induces otherwise unnecessary context switches on all online CPUs,
+// whether idle or not.
+//
+// Callback handling is provided by the rcu_tasks_kthread() function.
+//
+// Ordering is provided by the scheduler's context-switch code.
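A grace period for this flavor therefore reduces to forcing one context switch on each online CPU. A minimal sketch of the mechanism this comment describes (the function name is illustrative):

static void rcu_tasks_rude_wait_gp_sketch(struct rcu_tasks *rtp)
{
	schedule_on_each_cpu(rcu_tasks_be_rude);	/* one empty work item per CPU */
}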
// Empty function to allow workqueues to force a context switch.
static void rcu_tasks_be_rude(struct work_struct *work)
@@ -633,11 +1226,11 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
* period elapses, in other words after all currently executing RCU
* read-side critical sections have completed. call_rcu_tasks_rude()
* assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution. As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-strcuture synchronization.
+ * cond_resched_tasks_rcu_qs(), or transition to usermode execution (as
+ * usermode execution is schedulable). As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-structure synchronization.
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -655,8 +1248,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);
* grace period has elapsed, in other words after all currently
* executing rcu-tasks read-side critical sections have elapsed. These
* read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * cond_resched_tasks_rcu_qs(), userspace execution (which is a schedulable
+ * context), and (in theory, anyway) cond_resched().
*
* This is a very specialized primitive, intended only for a few uses in
* tracing and other situations requiring manipulation of function preambles
@@ -680,14 +1273,19 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
*/
void rcu_barrier_tasks_rude(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_rude();
+ rcu_barrier_tasks_generic(&rcu_tasks_rude);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);
+int rcu_tasks_rude_lazy_ms = -1;
+module_param(rcu_tasks_rude_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_rude_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_rude);
rcu_tasks_rude.gp_sleep = HZ / 10;
+ if (rcu_tasks_rude_lazy_ms >= 0)
+ rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms);
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
}
@@ -699,6 +1297,13 @@ void show_rcu_tasks_rude_gp_kthread(void)
}
EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+
+struct task_struct *get_rcu_tasks_rude_gp_kthread(void)
+{
+ return rcu_tasks_rude.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread);
+
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
////////////////////////////////////////////////////////////////////////
@@ -713,19 +1318,49 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// 2. Protects code in the idle loop, exception entry/exit, and
// CPU-hotplug code paths, similar to the capabilities of SRCU.
//
-// 3. Avoids expensive read-side instruction, having overhead similar
+// 3. Avoids expensive read-side instructions, having overhead similar
// to that of Preemptible RCU.
//
-// There are of course downsides. The grace-period code can send IPIs to
-// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace.
-// It is necessary to scan the full tasklist, much as for Tasks RCU. There
-// is a single callback queue guarded by a single lock, again, much as for
-// Tasks RCU. If needed, these downsides can be at least partially remedied.
+// There are of course downsides. For example, the grace-period code
+// can send IPIs to CPUs, even when those CPUs are in the idle loop or
+// in nohz_full userspace. If needed, these downsides can be at least
+// partially remedied.
//
// Perhaps most important, this variant of RCU does not affect the vanilla
// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace
// readers can operate from idle, offline, and exception entry/exit in no
// way allows rcu_preempt and rcu_sched readers to also do so.
+//
+// The implementation uses rcu_tasks_wait_gp(), which relies on function
+// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread()
+// function sets these function pointers up so that rcu_tasks_wait_gp()
+// invokes these functions in this order:
+//
+// rcu_tasks_trace_pregp_step():
+// Disables CPU hotplug, adds all currently executing tasks to the
+// holdout list, then checks the state of all tasks that blocked
+// or were preempted within their current RCU Tasks Trace read-side
+// critical section, adding them to the holdout list if appropriate.
+// Finally, this function re-enables CPU hotplug.
+// The ->pertask_func() pointer is NULL, so there is no per-task processing.
+// rcu_tasks_trace_postscan():
+// Invokes synchronize_rcu() to wait for late-stage exiting tasks
+// to finish exiting.
+// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty:
+// Scans the holdout list, attempting to identify a quiescent state
+// for each task on the list. If there is a quiescent state, the
+// corresponding task is removed from the holdout list. Once this
+// list is empty, the grace period has completed.
+// rcu_tasks_trace_postgp():
+// Provides the needed full memory barrier and does debug checks.
+//
+// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks.
+//
+// Pre-grace-period update-side code is ordered before the grace period
+// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period
+// read-side code is ordered before the grace period by atomic operations
+// on .b.need_qs flag of each task involved in this process, or by scheduler
+// context-switch ordering (for locked-down non-running readers).
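For reference, the read side that all of this machinery waits on is the rcu_read_lock_trace()/rcu_read_unlock_trace() pair from include/linux/rcupdate_trace.h. A minimal, hypothetical reader:

#include <linux/rcupdate_trace.h>

static int read_shared_value(int *p)	/* illustrative only */
{
	int v;

	rcu_read_lock_trace();
	v = READ_ONCE(*p);	/* may run from idle or exception entry/exit */
	rcu_read_unlock_trace();
	return v;
}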
// The lockdep state must be outside of #ifdef to be useful.
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -737,9 +1372,6 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
#ifdef CONFIG_TASKS_TRACE_RCU
-static atomic_t trc_n_readers_need_end; // Number of waited-for readers.
-static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks.
-
// Record outstanding IPIs to each CPU. No point in sending two...
static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
@@ -748,44 +1380,104 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
static unsigned long n_heavy_reader_attempts;
static unsigned long n_heavy_reader_updates;
static unsigned long n_heavy_reader_ofl_updates;
+static unsigned long n_trc_holdouts;
void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace,
"RCU Tasks Trace");
+/* Load from ->trc_reader_special.b.need_qs with proper ordering. */
+static u8 rcu_ld_need_qs(struct task_struct *t)
+{
+ smp_mb(); // Enforce full grace-period ordering.
+ return smp_load_acquire(&t->trc_reader_special.b.need_qs);
+}
+
+/* Store to ->trc_reader_special.b.need_qs with proper ordering. */
+static void rcu_st_need_qs(struct task_struct *t, u8 v)
+{
+ smp_store_release(&t->trc_reader_special.b.need_qs, v);
+ smp_mb(); // Enforce full grace-period ordering.
+}
+
/*
- * This irq_work handler allows rcu_read_unlock_trace() to be invoked
- * while the scheduler locks are held.
+ * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
+ * the four-byte operand-size restriction of some platforms.
+ * Returns the old value, which is often ignored.
*/
-static void rcu_read_unlock_iw(struct irq_work *iwp)
+u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
{
- wake_up(&trc_wait);
+ union rcu_special ret;
+ union rcu_special trs_old = READ_ONCE(t->trc_reader_special);
+ union rcu_special trs_new = trs_old;
+
+ if (trs_old.b.need_qs != old)
+ return trs_old.b.need_qs;
+ trs_new.b.need_qs = new;
+ ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+ return ret.b.need_qs;
}
-static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
+EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
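These helpers drive a small state machine in ->trc_reader_special.b.need_qs built from two bits, TRC_NEED_QS and TRC_NEED_QS_CHECKED (0x1 and 0x2 respectively, matching the " !CN" decode in show_stalled_task_trace() below). A sketch of the transitions:

/*
 *   0                                task not yet examined this grace period
 *   TRC_NEED_QS_CHECKED              examined and already in a quiescent state
 *   TRC_NEED_QS | TRC_NEED_QS_CHECKED
 *                                    examined while within a reader; the matching
 *                                    rcu_read_unlock_trace_special() clears
 *                                    TRC_NEED_QS, leaving only _CHECKED
 */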
-/* If we are the last reader, wake up the grace-period kthread. */
-void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
+/*
+ * If we are the last reader, signal the grace-period kthread.
+ * Also remove from the per-CPU list of blocked tasks.
+ */
+void rcu_read_unlock_trace_special(struct task_struct *t)
{
- int nq = t->trc_reader_special.b.need_qs;
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ union rcu_special trs;
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
- t->trc_reader_special.b.need_mb)
+ // Open-coded full-word version of rcu_ld_need_qs().
+ smp_mb(); // Enforce full grace-period ordering.
+ trs = smp_load_acquire(&t->trc_reader_special);
+
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb)
smp_mb(); // Pairs with update-side barriers.
// Update .need_qs before ->trc_reader_nesting for irq/NMI handlers.
- if (nq)
- WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
- WRITE_ONCE(t->trc_reader_nesting, nesting);
- if (nq && atomic_dec_and_test(&trc_n_readers_need_end))
- irq_work_queue(&rcu_tasks_trace_iw);
+ if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) {
+ u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS,
+ TRC_NEED_QS_CHECKED);
+
+ WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result);
+ }
+ if (trs.b.blocked) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_del_init(&t->trc_blkd_node);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, false);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ }
+ WRITE_ONCE(t->trc_reader_nesting, 0);
}
EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special);
+/* Add a newly blocked reader task to its CPU's list. */
+void rcu_tasks_trace_qs_blkd(struct task_struct *t)
+{
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+
+ local_irq_save(flags);
+ rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu);
+ raw_spin_lock_rcu_node(rtpcp); // irqs already disabled
+ t->trc_blkd_cpu = smp_processor_id();
+ if (!rtpcp->rtp_blkd_tasks.next)
+ INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ WRITE_ONCE(t->trc_reader_special.b.blocked, true);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+}
+EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd);
+
/* Add a task to the holdout list, if it is not already on the list. */
static void trc_add_holdout(struct task_struct *t, struct list_head *bhp)
{
if (list_empty(&t->trc_holdout_list)) {
get_task_struct(t);
list_add(&t->trc_holdout_list, bhp);
+ n_trc_holdouts++;
}
}
@@ -795,98 +1487,91 @@ static void trc_del_holdout(struct task_struct *t)
if (!list_empty(&t->trc_holdout_list)) {
list_del_init(&t->trc_holdout_list);
put_task_struct(t);
+ n_trc_holdouts--;
}
}
/* IPI handler to check task state. */
static void trc_read_check_handler(void *t_in)
{
+ int nesting;
struct task_struct *t = current;
struct task_struct *texp = t_in;
// If the task is no longer running on this CPU, leave.
- if (unlikely(texp != t)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
+ if (unlikely(texp != t))
goto reset_ipi; // Already on holdout list, so will check later.
- }
// If the task is not in a read-side critical section, and
// if this is the last reader, awaken the grace-period kthread.
- if (likely(!t->trc_reader_nesting)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
- // Mark as checked after decrement to avoid false
- // positives on the above WARN_ON_ONCE().
- WRITE_ONCE(t->trc_reader_checked, true);
+ nesting = READ_ONCE(t->trc_reader_nesting);
+ if (likely(!nesting)) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
goto reset_ipi;
}
// If we are racing with an rcu_read_unlock_trace(), try again later.
- if (unlikely(t->trc_reader_nesting < 0)) {
- if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
- wake_up(&trc_wait);
+ if (unlikely(nesting < 0))
goto reset_ipi;
- }
- WRITE_ONCE(t->trc_reader_checked, true);
- // Get here if the task is in a read-side critical section. Set
- // its state so that it will awaken the grace-period kthread upon
- // exit from that critical section.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
- WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
+ // Get here if the task is in a read-side critical section.
+ // Set its state so that it will update state for the grace-period
+ // kthread upon exit from that critical section.
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED);
reset_ipi:
// Allow future IPIs to be sent on CPU and for task.
// Also order this IPI handler against any later manipulations of
// the intended task.
- smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
+ smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^
}
/* Callback function for scheduler to check locked-down task. */
-static bool trc_inspect_reader(struct task_struct *t, void *arg)
+static int trc_inspect_reader(struct task_struct *t, void *bhp_in)
{
+ struct list_head *bhp = bhp_in;
int cpu = task_cpu(t);
- bool in_qs = false;
+ int nesting;
bool ofl = cpu_is_offline(cpu);
- if (task_curr(t)) {
- WARN_ON_ONCE(ofl && !is_idle_task(t));
-
+ if (task_curr(t) && !ofl) {
// If no chance of heavyweight readers, do it the hard way.
- if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- return false;
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+ return -EINVAL;
// If heavyweight readers are enabled on the remote task,
// we can inspect its state despite its currently running.
// However, we cannot safely change its state.
n_heavy_reader_attempts++;
- if (!ofl && // Check for "running" idle tasks on offline CPUs.
- !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
- return false; // No quiescent state, do it the hard way.
+ // Check for "running" idle tasks on offline CPUs.
+ if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting))
+ return -EINVAL; // No quiescent state, do it the hard way.
n_heavy_reader_updates++;
- if (ofl)
- n_heavy_reader_ofl_updates++;
- in_qs = true;
+ nesting = 0;
} else {
- in_qs = likely(!t->trc_reader_nesting);
+ // The task is not running, so C-language access is safe.
+ nesting = t->trc_reader_nesting;
+ WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t))));
+ if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
+ n_heavy_reader_ofl_updates++;
}
- // Mark as checked. Because this is called from the grace-period
- // kthread, also remove the task from the holdout list.
- t->trc_reader_checked = true;
- trc_del_holdout(t);
-
- if (in_qs)
- return true; // Already in quiescent state, done!!!
+ // If not exiting a read-side critical section, mark as checked
+ // so that the grace-period kthread will remove it from the
+ // holdout list.
+ if (!nesting) {
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ return 0; // In QS, so done.
+ }
+ if (nesting < 0)
+ return -EINVAL; // Reader transitioning, try again later.
// The task is in a read-side critical section, so set up its
- // state so that it will awaken the grace-period kthread upon exit
- // from that critical section.
- atomic_inc(&trc_n_readers_need_end); // One more to wait on.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
- WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
- return true;
+ // state so that it will update state upon exit from that critical
+ // section.
+ if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
+ trc_add_holdout(t, bhp);
+ return 0;
}
/* Attempt to extract the state for the specified task. */
@@ -901,20 +1586,25 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// The current task had better be in a quiescent state.
if (t == current) {
- t->trc_reader_checked = true;
- trc_del_holdout(t);
- WARN_ON_ONCE(t->trc_reader_nesting);
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
return;
}
// Attempt to nail down the task for inspection.
get_task_struct(t);
- if (try_invoke_on_locked_down_task(t, trc_inspect_reader, NULL)) {
+ if (!task_call_func(t, trc_inspect_reader, bhp)) {
put_task_struct(t);
return;
}
put_task_struct(t);
+ // If this task is not yet on the holdout list, then we are in
+ // an RCU read-side critical section. Otherwise, the invocation of
+ // trc_add_holdout() that added it to the list did the necessary
+ // get_task_struct(). Either way, the task cannot be freed out
+ // from under this code.
+
// If currently running, send an IPI, either way, add to list.
trc_add_holdout(t, bhp);
if (task_curr(t) &&
@@ -926,96 +1616,161 @@ static void trc_wait_for_one_reader(struct task_struct *t,
if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
return;
- atomic_inc(&trc_n_readers_need_end);
per_cpu(trc_ipi_to_cpu, cpu) = true;
t->trc_ipi_to_cpu = cpu;
rcu_tasks_trace.n_ipis++;
- if (smp_call_function_single(cpu,
- trc_read_check_handler, t, 0)) {
+ if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
// Just in case there is some other reason for
// failure than the target CPU being offline.
+ WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
+ __func__, cpu);
rcu_tasks_trace.n_ipis_fails++;
per_cpu(trc_ipi_to_cpu, cpu) = false;
- t->trc_ipi_to_cpu = cpu;
- if (atomic_dec_and_test(&trc_n_readers_need_end)) {
- WARN_ON_ONCE(1);
- wake_up(&trc_wait);
- }
+ t->trc_ipi_to_cpu = -1;
}
}
}
+/*
+ * Initialize for first-round processing for the specified task.
+ * Return false if task is NULL or already taken care of, true otherwise.
+ */
+static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
+{
+ // During early boot when there is only the one boot CPU, there
+ // is no idle task for the other CPUs. Also, the grace-period
+ // kthread is always in a quiescent state. In addition, just return
+ // if this task is already on the list.
+ if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list))
+ return false;
+
+ rcu_st_need_qs(t, 0);
+ t->trc_ipi_to_cpu = -1;
+ return true;
+}
+
+/* Do first-round processing for the specified task. */
+static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop)
+{
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_wait_for_one_reader(t, hop);
+}
+
/* Initialize for a new RCU-tasks-trace grace period. */
-static void rcu_tasks_trace_pregp_step(void)
+static void rcu_tasks_trace_pregp_step(struct list_head *hop)
{
+ LIST_HEAD(blkd_tasks);
int cpu;
-
- // Allow for fast-acting IPIs.
- atomic_set(&trc_n_readers_need_end, 1);
+ unsigned long flags;
+ struct rcu_tasks_percpu *rtpcp;
+ struct task_struct *t;
// There shouldn't be any old IPIs, but...
for_each_possible_cpu(cpu)
WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu));
- // Disable CPU hotplug across the tasklist scan.
- // This also waits for all readers in CPU-hotplug code paths.
+ // Disable CPU hotplug across the CPU scan for the benefit of
+ // any IPIs that might be needed. This also waits for all readers
+ // in CPU-hotplug code paths.
cpus_read_lock();
-}
-/* Do first-round processing for the specified task. */
-static void rcu_tasks_trace_pertask(struct task_struct *t,
- struct list_head *hop)
-{
- // During early boot when there is only the one boot CPU, there
- // is no idle task for the other CPUs. Just return.
- if (unlikely(t == NULL))
- return;
+ // These rcu_tasks_trace_pertask_prep() calls are serialized to
+ // allow safe access to the hop list.
+ for_each_online_cpu(cpu) {
+ rcu_read_lock();
+ t = cpu_curr_snapshot(cpu);
+ if (rcu_tasks_trace_pertask_prep(t, true))
+ trc_add_holdout(t, hop);
+ rcu_read_unlock();
+ cond_resched_tasks_rcu_qs();
+ }
- WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
- WRITE_ONCE(t->trc_reader_checked, false);
- t->trc_ipi_to_cpu = -1;
- trc_wait_for_one_reader(t, hop);
+ // Only after all running tasks have been accounted for is it
+ // safe to take care of the tasks that have blocked within their
+ // current RCU tasks trace read-side critical section.
+ for_each_possible_cpu(cpu) {
+ rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu);
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks);
+ while (!list_empty(&blkd_tasks)) {
+ rcu_read_lock();
+ t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node);
+ list_del_init(&t->trc_blkd_node);
+ list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks);
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ rcu_tasks_trace_pertask(t, hop);
+ rcu_read_unlock();
+ raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+ cond_resched_tasks_rcu_qs();
+ }
+
+ // Re-enable CPU hotplug now that the holdout list is populated.
+ cpus_read_unlock();
}
/*
- * Do intermediate processing between task and holdout scans and
- * pick up the idle tasks.
+ * Do intermediate processing between task and holdout scans.
*/
static void rcu_tasks_trace_postscan(struct list_head *hop)
{
- int cpu;
-
- for_each_possible_cpu(cpu)
- rcu_tasks_trace_pertask(idle_task(cpu), hop);
-
- // Re-enable CPU hotplug now that the tasklist scan has completed.
- cpus_read_unlock();
-
// Wait for late-stage exiting tasks to finish exiting.
// These might have passed the call to exit_tasks_rcu_finish().
+
+ // If you remove the following line, update rcu_trace_implies_rcu_gp()!!!
synchronize_rcu();
- // Any tasks that exit after this point will set ->trc_reader_checked.
+ // Any tasks that exit after this point will set
+ // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs.
+}
+
+/* Communicate task state back to the RCU tasks trace stall warning request. */
+struct trc_stall_chk_rdr {
+ int nesting;
+ int ipi_to_cpu;
+ u8 needqs;
+};
+
+static int trc_check_slow_task(struct task_struct *t, void *arg)
+{
+ struct trc_stall_chk_rdr *trc_rdrp = arg;
+
+ if (task_curr(t) && cpu_online(task_cpu(t)))
+ return false; // It is running, so decline to inspect it.
+ trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting);
+ trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu);
+ trc_rdrp->needqs = rcu_ld_need_qs(t);
+ return true;
}
/* Show the state of a task stalling the current RCU tasks trace GP. */
static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
{
int cpu;
+ struct trc_stall_chk_rdr trc_rdr;
+ bool is_idle_tsk = is_idle_task(t);
if (*firstreport) {
pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n");
*firstreport = false;
}
- // FIXME: This should attempt to use try_invoke_on_nonrunning_task().
cpu = task_cpu(t);
- pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
- t->pid,
- ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
- ".i"[is_idle_task(t)],
- ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
- t->trc_reader_nesting,
- " N"[!!t->trc_reader_special.b.need_qs],
- cpu);
+ if (!task_call_func(t, trc_check_slow_task, &trc_rdr))
+ pr_alert("P%d: %c%c\n",
+ t->pid,
+ ".I"[t->trc_ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk]);
+ else
+ pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n",
+ t->pid,
+ ".I"[trc_rdr.ipi_to_cpu >= 0],
+ ".i"[is_idle_tsk],
+ ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
+ ".B"[!!data_race(t->trc_reader_special.b.blocked)],
+ trc_rdr.nesting,
+ " !CN"[trc_rdr.needqs & 0x3],
+ " ?"[trc_rdr.needqs > 0x3],
+ cpu, cpu_online(cpu) ? "" : "(offline)");
sched_show_task(t);
}
@@ -1035,71 +1790,52 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
{
struct task_struct *g, *t;
- // Disable CPU hotplug across the holdout list scan.
+ // Disable CPU hotplug across the holdout list scan for IPIs.
cpus_read_lock();
list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
// If safe and needed, try to check the current task.
if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
- !READ_ONCE(t->trc_reader_checked))
+ !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED))
trc_wait_for_one_reader(t, hop);
// If check succeeded, remove this task from the list.
- if (READ_ONCE(t->trc_reader_checked))
+ if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 &&
+ rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED)
trc_del_holdout(t);
else if (needreport)
show_stalled_task_trace(t, firstreport);
+ cond_resched_tasks_rcu_qs();
}
// Re-enable CPU hotplug now that the holdout list scan has completed.
cpus_read_unlock();
if (needreport) {
- if (firstreport)
+ if (*firstreport)
pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n");
show_stalled_ipi_trace();
}
}
+static void rcu_tasks_trace_empty_fn(void *unused)
+{
+}
+
/* Wait for grace period to complete and provide ordering. */
static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
{
- bool firstreport;
- struct task_struct *g, *t;
- LIST_HEAD(holdouts);
- long ret;
+ int cpu;
- // Remove the safety count.
- smp_mb__before_atomic(); // Order vs. earlier atomics
- atomic_dec(&trc_n_readers_need_end);
- smp_mb__after_atomic(); // Order vs. later atomics
+ // Wait for any lingering IPI handlers to complete. Note that
+ // if a CPU has gone offline or transitioned to userspace in the
+ // meantime, all IPI handlers should have been drained beforehand.
+ // Yes, this assumes that CPUs process IPIs in order. If that ever
+ // changes, there will need to be a recheck and/or timed wait.
+ for_each_online_cpu(cpu)
+ if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))))
+ smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1);
- // Wait for readers.
- set_tasks_gp_state(rtp, RTGS_WAIT_READERS);
- for (;;) {
- ret = wait_event_idle_exclusive_timeout(
- trc_wait,
- atomic_read(&trc_n_readers_need_end) == 0,
- READ_ONCE(rcu_task_stall_timeout));
- if (ret)
- break; // Count reached zero.
- // Stall warning time, so make a list of the offenders.
- rcu_read_lock();
- for_each_process_thread(g, t)
- if (READ_ONCE(t->trc_reader_special.b.need_qs))
- trc_add_holdout(t, &holdouts);
- rcu_read_unlock();
- firstreport = true;
- list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) {
- if (READ_ONCE(t->trc_reader_special.b.need_qs))
- show_stalled_task_trace(t, &firstreport);
- trc_del_holdout(t); // Release task_struct reference.
- }
- if (firstreport)
- pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n");
- show_stalled_ipi_trace();
- pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end));
- }
smp_mb(); // Caller's code must be ordered after wakeup.
// Pairs with pretty much every ordering primitive.
}
@@ -1107,11 +1843,14 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
/* Report any needed quiescent state for this exiting task. */
static void exit_tasks_rcu_finish_trace(struct task_struct *t)
{
- WRITE_ONCE(t->trc_reader_checked, true);
- WARN_ON_ONCE(t->trc_reader_nesting);
- WRITE_ONCE(t->trc_reader_nesting, 0);
- if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
- rcu_read_unlock_trace_special(t, 0);
+ union rcu_special trs = READ_ONCE(t->trc_reader_special);
+
+ rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
+ if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
+ rcu_read_unlock_trace_special(t);
+ else
+ WRITE_ONCE(t->trc_reader_nesting, 0);
}
/**
@@ -1119,15 +1858,11 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t)
* @rhp: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_tasks_trace()
- * assumes that the read-side critical sections end at context switch,
- * cond_resched_rcu_qs(), or transition to usermode execution. As such,
- * there are no read-side primitives analogous to rcu_read_lock() and
- * rcu_read_unlock() because this primitive is intended to determine
- * that all tasks have passed through a safe state, not so much for
- * data-strcuture synchronization.
+ * The callback function will be invoked some time after a trace rcu-tasks
+ * grace period elapses, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have completed. These
+ * read-side critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
*
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
@@ -1143,7 +1878,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
*
* Control will return to the caller some time after a trace rcu-tasks
* grace period has elapsed, in other words after all currently executing
- * rcu-tasks read-side critical sections have elapsed. These read-side
+ * trace rcu-tasks read-side critical sections have elapsed. These read-side
* critical sections are delimited by calls to rcu_read_lock_trace()
* and rcu_read_unlock_trace().
*
@@ -1170,13 +1905,16 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace);
*/
void rcu_barrier_tasks_trace(void)
{
- /* There is only one callback queue, so this is easy. ;-) */
- synchronize_rcu_tasks_trace();
+ rcu_barrier_tasks_generic(&rcu_tasks_trace);
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace);
+int rcu_tasks_trace_lazy_ms = -1;
+module_param(rcu_tasks_trace_lazy_ms, int, 0444);
+
static int __init rcu_spawn_tasks_trace_kthread(void)
{
+ cblist_init_generic(&rcu_tasks_trace);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
rcu_tasks_trace.gp_sleep = HZ / 10;
rcu_tasks_trace.init_fract = HZ / 10;
@@ -1188,8 +1926,9 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
if (rcu_tasks_trace.init_fract <= 0)
rcu_tasks_trace.init_fract = 1;
}
+ if (rcu_tasks_trace_lazy_ms >= 0)
+ rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms);
rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step;
- rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask;
rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan;
rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace;
rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp;
@@ -1202,7 +1941,8 @@ void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
- sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end),
+ sprintf(buf, "N%lu h:%lu/%lu/%lu",
+ data_race(n_trc_holdouts),
data_race(n_heavy_reader_ofl_updates),
data_race(n_heavy_reader_updates),
data_race(n_heavy_reader_attempts));
@@ -1211,6 +1951,12 @@ void show_rcu_tasks_trace_gp_kthread(void)
EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
#endif // !defined(CONFIG_TINY_RCU)
+struct task_struct *get_rcu_tasks_trace_gp_kthread(void)
+{
+ return rcu_tasks_trace.kthread_ptr;
+}
+EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread);
+
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
@@ -1224,6 +1970,122 @@ void show_rcu_tasks_gp_kthreads(void)
}
#endif /* #ifndef CONFIG_TINY_RCU */
+#ifdef CONFIG_PROVE_RCU
+struct rcu_tasks_test_desc {
+ struct rcu_head rh;
+ const char *name;
+ bool notrun;
+ unsigned long runstart;
+};
+
+static struct rcu_tasks_test_desc tests[] = {
+ {
+ .name = "call_rcu_tasks()",
+ /* If not defined, the test is skipped. */
+ .notrun = IS_ENABLED(CONFIG_TASKS_RCU),
+ },
+ {
+ .name = "call_rcu_tasks_rude()",
+ /* If not defined, the test is skipped. */
+ .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU),
+ },
+ {
+ .name = "call_rcu_tasks_trace()",
+ /* If not defined, the test is skipped. */
+ .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU)
+ }
+};
+
+static void test_rcu_tasks_callback(struct rcu_head *rhp)
+{
+ struct rcu_tasks_test_desc *rttd =
+ container_of(rhp, struct rcu_tasks_test_desc, rh);
+
+ pr_info("Callback from %s invoked.\n", rttd->name);
+
+ rttd->notrun = false;
+}
+
+static void rcu_tasks_initiate_self_tests(void)
+{
+#ifdef CONFIG_TASKS_RCU
+ pr_info("Running RCU Tasks wait API self tests\n");
+ tests[0].runstart = jiffies;
+ synchronize_rcu_tasks();
+ call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ pr_info("Running RCU Tasks Rude wait API self tests\n");
+ tests[1].runstart = jiffies;
+ synchronize_rcu_tasks_rude();
+ call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ pr_info("Running RCU Tasks Trace wait API self tests\n");
+ tests[2].runstart = jiffies;
+ synchronize_rcu_tasks_trace();
+ call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
+#endif
+}
+
+/*
+ * Return: 0 - test passed
+ * 1 - test failed, but have not timed out yet
+ * -1 - test failed and timed out
+ */
+static int rcu_tasks_verify_self_tests(void)
+{
+ int ret = 0;
+ int i;
+ unsigned long bst = rcu_task_stall_timeout;
+
+ if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT)
+ bst = RCU_TASK_BOOT_STALL_TIMEOUT;
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ while (tests[i].notrun) { // still hanging.
+ if (time_after(jiffies, tests[i].runstart + bst)) {
+ pr_err("%s has failed boot-time tests.\n", tests[i].name);
+ ret = -1;
+ break;
+ }
+ ret = 1;
+ break;
+ }
+ }
+ WARN_ON(ret < 0);
+
+ return ret;
+}
+
+/*
+ * Repeat the rcu_tasks_verify_self_tests() call once every second until the
+ * test passes or has timed out.
+ */
+static struct delayed_work rcu_tasks_verify_work;
+static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused)
+{
+ int ret = rcu_tasks_verify_self_tests();
+
+ if (ret <= 0)
+ return;
+
+ /* Test fails but not timed out yet, reschedule another check */
+ schedule_delayed_work(&rcu_tasks_verify_work, HZ);
+}
+
+static int rcu_tasks_verify_schedule_work(void)
+{
+ INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn);
+ rcu_tasks_verify_work_fn(NULL);
+ return 0;
+}
+late_initcall(rcu_tasks_verify_schedule_work);
+#else /* #ifdef CONFIG_PROVE_RCU */
+static void rcu_tasks_initiate_self_tests(void) { }
+#endif /* #else #ifdef CONFIG_PROVE_RCU */
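The boot-time self-tests above drive the three call_rcu_tasks*() flavors exactly the way an ordinary caller would. A minimal sketch of such a caller follows; struct my_data, my_free_cb() and my_retire() are purely illustrative names, not part of this patch:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_data {
	struct rcu_head rh;
	int payload;
};

/* Runs only after every task has passed through a voluntary context
 * switch, idle, or user-mode execution since my_retire() was called. */
static void my_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct my_data, rh));
}

static void my_retire(struct my_data *p)
{
	call_rcu_tasks(&p->rh, my_free_cb);
}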
+
void __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
@@ -1237,9 +2099,11 @@ void __init rcu_init_tasks_generic(void)
#ifdef CONFIG_TASKS_TRACE_RCU
rcu_spawn_tasks_trace_kthread();
#endif
+
+ // Run the self-tests.
+ rcu_tasks_initiate_self_tests();
}
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
-void show_rcu_tasks_gp_kthreads(void) {}
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index aa897c3f2e92..fec804b79080 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -32,17 +32,19 @@ struct rcu_ctrlblk {
struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
struct rcu_head **curtail; /* ->next pointer of last CB. */
+ unsigned long gp_seq; /* Grace-period counter. */
};
/* Definition for rcupdate control block. */
static struct rcu_ctrlblk rcu_ctrlblk = {
.donetail = &rcu_ctrlblk.rcucblist,
.curtail = &rcu_ctrlblk.rcucblist,
+ .gp_seq = 0 - 300UL,
};
void rcu_barrier(void)
{
- wait_rcu_gp(call_rcu);
+ wait_rcu_gp(call_rcu_hurry);
}
EXPORT_SYMBOL(rcu_barrier);
@@ -56,6 +58,7 @@ void rcu_qs(void)
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
raise_softirq_irqoff(RCU_SOFTIRQ);
}
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
local_irq_restore(flags);
}
@@ -94,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
trace_rcu_invoke_callback("", head);
f = head->func;
+ debug_rcu_head_callback(head);
WRITE_ONCE(head->func, (rcu_callback_t)0L);
f(head);
rcu_lock_release(&rcu_callback_map);
@@ -136,8 +140,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
/*
* Wait for a grace period to elapse. But it is illegal to invoke
* synchronize_rcu() from within an RCU read-side critical section.
- * Therefore, any legal call to synchronize_rcu() is a quiescent
- * state, and so on a UP system, synchronize_rcu() need do nothing.
+ * Therefore, any legal call to synchronize_rcu() is a quiescent state,
+ * and so on a UP system, synchronize_rcu() need do nothing, other than
+ * let the polled APIs know that another grace period elapsed.
+ *
* (But Lai Jiangshan points out the benefits of doing might_sleep()
* to reduce latency.)
*
@@ -149,9 +155,14 @@ void synchronize_rcu(void)
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
+ WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
+static void tiny_rcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
/*
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
@@ -159,9 +170,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
+ static atomic_t doublefrees;
unsigned long flags;
- debug_rcu_head_queue(head);
+ if (debug_rcu_head_queue(head)) {
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
+
+ if (!__is_kvfree_rcu_offset((unsigned long)head->func))
+ WRITE_ONCE(head->func, tiny_rcu_leak_callback);
+ return;
+ }
+
head->func = func;
head->next = NULL;
@@ -177,9 +199,66 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(call_rcu);
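The debug_rcu_head_queue() check added to call_rcu() above turns a double queueing of the same rcu_head into a loud diagnostic rather than silent list corruption. An illustrative instance of the bug it catches, with my_obj and my_free_cb being hypothetical:

/* Illustrative only: queueing the same rcu_head twice before the first
 * callback has been invoked. */
call_rcu(&my_obj->rh, my_free_cb);
call_rcu(&my_obj->rh, my_free_cb);	/* With CONFIG_DEBUG_OBJECTS_RCU_HEAD,
					 * this now logs "Double-freed CB" and
					 * neutralizes the callback instead of
					 * corrupting the callback list. */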
+/*
+ * Store a grace-period-counter "cookie". For more information,
+ * see the Tree RCU header comment.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/*
+ * Return a grace-period-counter "cookie". For more information,
+ * see the Tree RCU header comment.
+ */
+unsigned long get_state_synchronize_rcu(void)
+{
+ return READ_ONCE(rcu_ctrlblk.gp_seq);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
+
+/*
+ * Return a grace-period-counter "cookie" and ensure that a future grace
+ * period completes. For more information, see the Tree RCU header comment.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+ unsigned long gp_seq = get_state_synchronize_rcu();
+
+ if (unlikely(is_idle_task(current))) {
+ /* force scheduling for rcu_qs() */
+ resched_cpu(0);
+ }
+ return gp_seq;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
+
+/*
+ * Return true if the grace period corresponding to oldstate has completed
+ * and false otherwise. For more information, see the Tree RCU header
+ * comment.
+ */
+bool poll_state_synchronize_rcu(unsigned long oldstate)
+{
+ return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
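Together, get_state_synchronize_rcu(), start_poll_synchronize_rcu() and poll_state_synchronize_rcu() give Tiny RCU the same polled grace-period interface as Tree RCU. A minimal usage sketch, assuming a hypothetical object that embeds the cookie:

struct my_obj {
	unsigned long gp_cookie;
	/* ...payload... */
};

static void my_retire(struct my_obj *p)
{
	/* Record, and if needed kick off, a grace period. */
	p->gp_cookie = start_poll_synchronize_rcu();
}

static bool my_can_reuse(struct my_obj *p)
{
	/* True once a full grace period has elapsed since my_retire(). */
	return poll_state_synchronize_rcu(p->gp_cookie);
}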
+
+#ifdef CONFIG_KASAN_GENERIC
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+{
+ if (head)
+ kasan_record_aux_stack_noalloc(ptr);
+
+ __kvfree_call_rcu(head, ptr);
+}
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+#endif
+
void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
rcu_early_boot_tests();
- srcu_init();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 40e5e3dd253e..b2bccfd37c38 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -31,7 +31,10 @@
#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/completion.h>
+#include <linux/kmemleak.h>
#include <linux/moduleparam.h>
+#include <linux/panic.h>
+#include <linux/panic_notifier.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -60,6 +63,7 @@
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kasan.h>
+#include <linux/context_tracking.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -72,36 +76,33 @@
/* Data structures. */
-/*
- * Steal a bit from the bottom of ->dynticks for idle entry/exit
- * control. Initially this is for TLB flushing.
- */
-#define RCU_DYNTICK_CTRL_MASK 0x1
-#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
-
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
- .dynticks_nesting = 1,
- .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
- .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+ .gpwrap = true,
+#ifdef CONFIG_RCU_NOCB_CPU
+ .cblist.flags = SEGCBLIST_RCU_CORE,
+#endif
};
static struct rcu_state rcu_state = {
.level = { &rcu_state.node[0] },
.gp_state = RCU_GP_IDLE,
.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
+ .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
.name = RCU_NAME,
.abbr = RCU_ABBR,
.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
- .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
+ .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
-static bool use_softirq = true;
+static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
+#ifndef CONFIG_PREEMPT_RT
module_param(use_softirq, bool, 0444);
+#endif
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
@@ -144,15 +145,22 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
+static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
+static bool rcu_init_invoked(void);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-/* rcuc/rcub kthread realtime priority */
+/*
+ * rcuc/rcub/rcuop kthread realtime priority. Whether the "rcuop"
+ * kthreads actually use this real-time priority is controlled by
+ * the additional CONFIG_RCU_NOCB_CPU_CB_BOOST Kconfig option.
+ */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0444);
@@ -180,6 +188,17 @@ module_param(rcu_unlock_delay, int, 0444);
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
+// A page shrinker can ask for pages to be freed to make them
+// available for other parts of the system. This usually happens
+// under low memory conditions, and in that case we should also
+// defer page-cache filling for a short time period.
+//
+// The default value is 5 seconds, which is long enough to reduce
+// interference with the shrinker while it asks other systems to
+// drain their caches.
+static int rcu_delay_page_cache_fill_msec = 5000;
+module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -196,18 +215,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
* the need for long delays to increase some race probabilities with the
* need for fast grace periods to increase other race probabilities.
*/
-#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */
-
-/*
- * Compute the mask of online CPUs for the specified rcu_node structure.
- * This will not be stable unless the rcu_node structure's ->lock is
- * held, but the bit corresponding to the current CPU will be stable
- * in most contexts.
- */
-static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
-{
- return READ_ONCE(rnp->qsmaskinitnext);
-}
+#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
/*
* Return true if an RCU grace period is in progress. The READ_ONCE()s
@@ -236,58 +244,7 @@ void rcu_softirq_qs(void)
{
rcu_qs();
rcu_preempt_deferred_qs(current);
-}
-
-/*
- * Record entry into an extended quiescent state. This is only to be
- * called when not already in an extended quiescent state, that is,
- * RCU is watching prior to the call to this function and is no longer
- * watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_enter(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- int seq;
-
- /*
- * CPUs seeing atomic_add_return() must see prior RCU read-side
- * critical sections, and we also must force ordering with the
- * next idle sojourn.
- */
- rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
- // RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_CTR));
- /* Better not have special action (TLB flush) pending! */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_MASK));
-}
-
-/*
- * Record exit from an extended quiescent state. This is only to be
- * called from an extended quiescent state, that is, RCU is not watching
- * prior to the call to this function and is watching upon return.
- */
-static noinstr void rcu_dynticks_eqs_exit(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- int seq;
-
- /*
- * CPUs seeing atomic_add_return() must see prior idle sojourns,
- * and we also must force ordering with the next RCU read-side
- * critical section.
- */
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
- // RCU is now watching. Better not be in an extended quiescent state!
- rcu_dynticks_task_trace_exit(); // After ->dynticks update!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(seq & RCU_DYNTICK_CTRL_CTR));
- if (seq & RCU_DYNTICK_CTRL_MASK) {
- arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
- smp_mb__after_atomic(); /* _exit after clearing mask. */
- }
+ rcu_tasks_qs(current, false);
}
/*
@@ -302,34 +259,19 @@ static noinstr void rcu_dynticks_eqs_exit(void)
*/
static void rcu_dynticks_eqs_online(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
+ if (ct_dynticks() & RCU_DYNTICKS_IDX)
return;
- atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
-}
-
-/*
- * Is the current CPU in an extended quiescent state?
- *
- * No ordering, as we are sampling CPU-local information.
- */
-static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+ ct_state_inc(RCU_DYNTICKS_IDX);
}
/*
* Snapshot the ->dynticks counter with full ordering so as to allow
* stable comparison of this counter with past and future snapshots.
*/
-static int rcu_dynticks_snap(struct rcu_data *rdp)
+static int rcu_dynticks_snap(int cpu)
{
- int snap = atomic_add_return(0, &rdp->dynticks);
-
- return snap & ~RCU_DYNTICK_CTRL_MASK;
+ smp_mb(); // Fundamental RCU ordering guarantee.
+ return ct_dynticks_cpu_acquire(cpu);
}
/*
@@ -338,15 +280,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & RCU_DYNTICK_CTRL_CTR);
-}
-
-/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
-bool rcu_is_idle_cpu(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-
- return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
+ return !(snap & RCU_DYNTICKS_IDX);
}
/*
@@ -356,7 +290,7 @@ bool rcu_is_idle_cpu(int cpu)
*/
static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
{
- return snap != rcu_dynticks_snap(rdp);
+ return snap != rcu_dynticks_snap(rdp->cpu);
}
/*
@@ -365,45 +299,17 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
*/
bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
int snap;
// If not quiescent, force back to earlier extended quiescent state.
- snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK |
- RCU_DYNTICK_CTRL_CTR);
-
+ snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
smp_rmb(); // Order ->dynticks and *vp reads.
if (READ_ONCE(*vp))
return false; // Non-zero, so report failure;
smp_rmb(); // Order *vp read and ->dynticks re-read.
// If still in the same extended quiescent state, we are good!
- return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK);
-}
-
-/*
- * Set the special (bottom) bit of the specified CPU so that it
- * will take special action (such as flushing its TLB) on the
- * next exit from an extended quiescent state. Returns true if
- * the bit was successfully set, or false if the CPU was not in
- * an extended quiescent state.
- */
-bool rcu_eqs_special_set(int cpu)
-{
- int old;
- int new;
- int new_old;
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- new_old = atomic_read(&rdp->dynticks);
- do {
- old = new_old;
- if (old & RCU_DYNTICK_CTRL_CTR)
- return false;
- new = old | RCU_DYNTICK_CTRL_MASK;
- new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
- } while (new_old != old);
- return true;
+ return snap == ct_dynticks_cpu(cpu);
}
/*
@@ -419,13 +325,12 @@ bool rcu_eqs_special_set(int cpu)
*/
notrace void rcu_momentary_dyntick_idle(void)
{
- int special;
+ int seq;
raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
- &this_cpu_ptr(&rcu_data)->dynticks);
+ seq = ct_state_inc(2 * RCU_DYNTICKS_IDX);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
+ WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
@@ -450,13 +355,13 @@ static int rcu_is_cpu_rrupt_from_idle(void)
lockdep_assert_irqs_disabled();
/* Check for counter underflows */
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0,
"RCU dynticks_nesting counter underflow!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0,
"RCU dynticks_nmi_nesting counter underflow/zero!");
/* Are we at first interrupt nesting level? */
- nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting);
+ nesting = ct_dynticks_nmi_nesting();
if (nesting > 1)
return false;
@@ -466,7 +371,7 @@ static int rcu_is_cpu_rrupt_from_idle(void)
WARN_ON_ONCE(!nesting && !is_idle_task(current));
/* Does CPU appear to be idle from an RCU standpoint? */
- return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
+ return ct_dynticks_nesting() == 0;
}
#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
@@ -617,187 +522,45 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
+#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
- * Enter an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
+ * An empty function that will trigger a reschedule on
+ * IRQ tail once IRQs get re-enabled on userspace/guest resume.
*/
-static noinstr void rcu_eqs_enter(bool user)
+static void late_wakeup_func(struct irq_work *work)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
- WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- rdp->dynticks_nesting == 0);
- if (rdp->dynticks_nesting != 1) {
- // RCU will still be watching, so just do accounting and leave.
- rdp->dynticks_nesting--;
- return;
- }
-
- lockdep_assert_irqs_disabled();
- instrumentation_begin();
- trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks));
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- rdp = this_cpu_ptr(&rcu_data);
- do_nocb_deferred_wakeup(rdp);
- rcu_prepare_for_idle();
- rcu_preempt_deferred_qs(current);
-
- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- instrumentation_end();
- WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */
- // RCU is watching here ...
- rcu_dynticks_eqs_enter();
- // ... but is no longer watching here.
- rcu_dynticks_task_enter();
-}
-
-/**
- * rcu_idle_enter - inform RCU that current CPU is entering idle
- *
- * Enter idle mode, in other words, -leave- the mode in which RCU
- * read-side critical sections can occur. (Though RCU read-side
- * critical sections can occur in irq handlers in idle, a possibility
- * handled by irq_enter() and irq_exit().)
- *
- * If you add or remove a call to rcu_idle_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_idle_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_eqs_enter(false);
}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
-#ifdef CONFIG_NO_HZ_FULL
-/**
- * rcu_user_enter - inform RCU that we are resuming userspace.
- *
- * Enter RCU idle mode right before resuming userspace. No use of RCU
- * is permitted between this call and rcu_user_exit(). This way the
- * CPU doesn't need to maintain the tick for RCU maintenance purposes
- * when the CPU runs in userspace.
- *
- * If you add or remove a call to rcu_user_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_user_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_eqs_enter(true);
-}
-#endif /* CONFIG_NO_HZ_FULL */
+static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
+ IRQ_WORK_INIT(late_wakeup_func);
-/**
- * rcu_nmi_exit - inform RCU of exit from NMI context
+/*
+ * If either:
*
- * If we are returning from the outermost NMI handler that interrupted an
- * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting
- * to let the RCU grace-period handling know that the CPU is back to
- * being RCU-idle.
+ * 1) the task is about to enter guest mode and $ARCH doesn't support KVM generic work
+ * 2) the task is about to enter user mode and $ARCH doesn't support generic entry.
*
- * If you add or remove a call to rcu_nmi_exit(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
+ * In these cases the late RCU wakeups aren't supported in the resched loops and our
+ * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
+ * get re-enabled.
*/
-noinstr void rcu_nmi_exit(void)
+noinstr void rcu_irq_work_resched(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- instrumentation_begin();
- /*
- * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
- * (We are exiting an NMI handler, so RCU better be paying attention
- * to us!)
- */
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
- WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
-
- /*
- * If the nesting level is not 1, the CPU wasn't RCU-idle, so
- * leave it in non-RCU-idle state.
- */
- if (rdp->dynticks_nmi_nesting != 1) {
- trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2,
- atomic_read(&rdp->dynticks));
- WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
- rdp->dynticks_nmi_nesting - 2);
- instrumentation_end();
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
return;
- }
-
- /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
- trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks));
- WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
- if (!in_nmi())
- rcu_prepare_for_idle();
+ if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
+ return;
- // instrumentation for the noinstr rcu_dynticks_eqs_enter()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
+ instrumentation_begin();
+ if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
+ irq_work_queue(this_cpu_ptr(&late_wakeup_work));
+ }
instrumentation_end();
-
- // RCU is watching here ...
- rcu_dynticks_eqs_enter();
- // ... but is no longer watching here.
-
- if (!in_nmi())
- rcu_dynticks_task_enter();
-}
-
-/**
- * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
- *
- * Exit from an interrupt handler, which might possibly result in entering
- * idle mode, in other words, leaving the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * This code assumes that the idle loop never does anything that might
- * result in unbalanced calls to irq_enter() and irq_exit(). If your
- * architecture's idle loop violates this assumption, RCU will give you what
- * you deserve, good and hard. But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_irq_exit(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_exit();
-}
-
-/**
- * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq
- * towards in kernel preemption
- *
- * Same as rcu_irq_exit() but has a sanity check that scheduling is safe
- * from RCU point of view. Invoked from return from interrupt before kernel
- * preemption.
- */
-void rcu_irq_exit_preempt(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_exit();
-
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
- "RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
- DYNTICK_IRQ_NONIDLE,
- "Bad RCU dynticks_nmi_nesting counter\n");
- RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
- "RCU in extended quiescent state!");
}
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */
#ifdef CONFIG_PROVE_RCU
/**
@@ -807,9 +570,9 @@ void rcu_irq_exit_check_preempt(void)
{
lockdep_assert_irqs_disabled();
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0,
+ RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
"RCU dynticks_nesting counter underflow/zero!");
- RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) !=
+ RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() !=
DYNTICK_IRQ_NONIDLE,
"Bad RCU dynticks_nmi_nesting counter\n");
RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
@@ -817,96 +580,8 @@ void rcu_irq_exit_check_preempt(void)
}
#endif /* #ifdef CONFIG_PROVE_RCU */
-/*
- * Wrapper for rcu_irq_exit() where interrupts are enabled.
- *
- * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_irq_exit_irqson(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_exit();
- local_irq_restore(flags);
-}
-
-/*
- * Exit an RCU extended quiescent state, which can be either the
- * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to
- * allow for the possibility of usermode upcalls messing up our count of
- * interrupt nesting level during the busy period that is just now starting.
- */
-static void noinstr rcu_eqs_exit(bool user)
-{
- struct rcu_data *rdp;
- long oldval;
-
- lockdep_assert_irqs_disabled();
- rdp = this_cpu_ptr(&rcu_data);
- oldval = rdp->dynticks_nesting;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
- if (oldval) {
- // RCU was already watching, so just do accounting and leave.
- rdp->dynticks_nesting++;
- return;
- }
- rcu_dynticks_task_exit();
- // RCU is not watching here ...
- rcu_dynticks_eqs_exit();
- // ... but is watching here.
- instrumentation_begin();
-
- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- rcu_cleanup_after_idle();
- trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks));
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current));
- WRITE_ONCE(rdp->dynticks_nesting, 1);
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
- WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE);
- instrumentation_end();
-}
-
-/**
- * rcu_idle_exit - inform RCU that current CPU is leaving idle
- *
- * Exit idle mode, in other words, -enter- the mode in which RCU
- * read-side critical sections can occur.
- *
- * If you add or remove a call to rcu_idle_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void rcu_idle_exit(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_eqs_exit(false);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
-
#ifdef CONFIG_NO_HZ_FULL
/**
- * rcu_user_exit - inform RCU that we are exiting userspace.
- *
- * Exit RCU idle mode while entering the kernel because it can
- * run a RCU read side critical section anytime.
- *
- * If you add or remove a call to rcu_user_exit(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-void noinstr rcu_user_exit(void)
-{
- rcu_eqs_exit(1);
-}
-
-/**
* __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
*
* The scheduler tick is not normally enabled when CPUs enter the kernel
@@ -958,7 +633,7 @@ void __rcu_irq_enter_check_tick(void)
// prevents self-deadlock. So we can safely recheck under the lock.
// Note that the nohz_full state currently cannot change.
raw_spin_lock_rcu_node(rdp->mynode);
- if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
+ if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
// A nohz_full CPU is in the kernel and RCU needs a
// quiescent state. Turn on the tick!
WRITE_ONCE(rdp->rcu_forced_tick, true);
@@ -966,116 +641,24 @@ void __rcu_irq_enter_check_tick(void)
}
raw_spin_unlock_rcu_node(rdp->mynode);
}
+NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
-/**
- * rcu_nmi_enter - inform RCU of entry to NMI context
- *
- * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
- * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know
- * that the CPU is active. This implementation permits nested NMIs, as
- * long as the nesting level does not overflow an int. (You will probably
- * run out of stack space first.)
- *
- * If you add or remove a call to rcu_nmi_enter(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_nmi_enter(void)
-{
- long incby = 2;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- /* Complain about underflow. */
- WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
-
- /*
- * If idle from RCU viewpoint, atomically increment ->dynticks
- * to mark non-idle and increment ->dynticks_nmi_nesting by one.
- * Otherwise, increment ->dynticks_nmi_nesting by two. This means
- * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
- * to be in the outermost NMI handler that interrupted an RCU-idle
- * period (observation due to Andy Lutomirski).
- */
- if (rcu_dynticks_curr_cpu_in_eqs()) {
-
- if (!in_nmi())
- rcu_dynticks_task_exit();
-
- // RCU is not watching here ...
- rcu_dynticks_eqs_exit();
- // ... but is watching here.
-
- if (!in_nmi()) {
- instrumentation_begin();
- rcu_cleanup_after_idle();
- instrumentation_end();
- }
-
- instrumentation_begin();
- // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()
- instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks));
- // instrumentation for the noinstr rcu_dynticks_eqs_exit()
- instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks));
-
- incby = 1;
- } else if (!in_nmi()) {
- instrumentation_begin();
- rcu_irq_enter_check_tick();
- instrumentation_end();
- } else {
- instrumentation_begin();
- }
-
- trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
- rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks));
- instrumentation_end();
- WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */
- rdp->dynticks_nmi_nesting + incby);
- barrier();
-}
-
-/**
- * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
- *
- * Enter an interrupt handler, which might possibly result in exiting
- * idle mode, in other words, entering the mode in which read-side critical
- * sections can occur. The caller must have disabled interrupts.
- *
- * Note that the Linux kernel is fully capable of entering an interrupt
- * handler that it never exits, for example when doing upcalls to user mode!
- * This code assumes that the idle loop never does upcalls to user mode.
- * If your architecture's idle loop does do upcalls to user mode (or does
- * anything else that results in unbalanced calls to the irq_enter() and
- * irq_exit() functions), RCU will give you what you deserve, good and hard.
- * But very infrequently and irreproducibly.
- *
- * Use things like work queues to work around this limitation.
- *
- * You have been warned.
- *
- * If you add or remove a call to rcu_irq_enter(), be sure to test with
- * CONFIG_RCU_EQS_DEBUG=y.
- */
-noinstr void rcu_irq_enter(void)
-{
- lockdep_assert_irqs_disabled();
- rcu_nmi_enter();
-}
-
/*
- * Wrapper for rcu_irq_enter() where interrupts are enabled.
+ * Check to see if any future non-offloaded RCU-related work will need
+ * to be done by the current CPU, even if none need be done immediately,
+ * returning 1 if so. This function is part of the RCU implementation;
+ * it is -not- an exported member of the RCU API. This is used by
+ * the idle-entry code to figure out whether it is safe to disable the
+ * scheduler-clock interrupt.
*
- * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
- * with CONFIG_RCU_EQS_DEBUG=y.
+ * Just check whether or not this CPU has non-offloaded RCU callbacks
+ * queued.
*/
-void rcu_irq_enter_irqson(void)
+int rcu_needs_cpu(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
- rcu_irq_enter();
- local_irq_restore(flags);
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
+ !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
}
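rcu_needs_cpu() is consulted on the idle-entry path. The sketch below only illustrates that decision; keep_tick_running() and maybe_stop_tick() stand in for the real tick/nohz callers and are not part of this patch:

/* Illustrative idle-entry logic, not this patch's code. */
if (rcu_needs_cpu()) {
	/* Non-offloaded callbacks are queued: keep the scheduler-clock tick. */
	keep_tick_running();
} else {
	/* RCU has no pending work on this CPU: the tick may be stopped. */
	maybe_stop_tick();
}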
/*
@@ -1095,12 +678,16 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
}
/**
- * rcu_is_watching - see if RCU thinks that the current CPU is not idle
+ * rcu_is_watching - RCU read-side critical sections permitted on current CPU?
+ *
+ * Return @true if RCU is watching the running CPU and @false otherwise.
+ * A @true return means that this CPU can safely enter RCU read-side
+ * critical sections.
*
- * Return true if RCU is watching the running CPU, which means that this
- * CPU can safely enter RCU read-side critical sections. In other words,
- * if the current CPU is not in its idle loop or is in an interrupt or
- * NMI handler, return true.
+ * Although calls to rcu_is_watching() from most parts of the kernel
+ * will return @true, there are important exceptions. For example, if the
+ * current CPU is deep within its idle loop, in kernel entry/exit code,
+ * or offline, rcu_is_watching() will return @false.
*
* Make notrace because it can be called by the internal functions of
* ftrace, and making this notrace removes unnecessary recursion calls.
@@ -1134,43 +721,8 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
-#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
-
-/*
- * Is the current CPU online as far as RCU is concerned?
- *
- * Disable preemption to avoid false positives that could otherwise
- * happen due to the current CPU number being sampled, this task being
- * preempted, its old CPU being taken offline, resuming on some other CPU,
- * then determining that its old CPU is now offline.
- *
- * Disable checking if in an NMI handler because we cannot safely
- * report errors from NMI handlers anyway. In addition, it is OK to use
- * RCU on an offline processor during initial boot, hence the check for
- * rcu_scheduler_fully_active.
- */
-bool rcu_lockdep_current_cpu_online(void)
-{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- bool ret = false;
-
- if (in_nmi() || !rcu_scheduler_fully_active)
- return true;
- preempt_disable_notrace();
- rdp = this_cpu_ptr(&rcu_data);
- rnp = rdp->mynode;
- if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
- ret = true;
- preempt_enable_notrace();
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-
-#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
-
/*
- * We are reporting a quiescent state on behalf of some other CPU, so
+ * When trying to report a quiescent state on behalf of some other CPU,
* it is our responsibility to check for and handle potential overflow
* of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
* After all, the CPU might be in deep idle state, and thus executing no
@@ -1193,7 +745,7 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- rdp->dynticks_snap = rcu_dynticks_snap(rdp);
+ rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rdp->mynode, rdp);
@@ -1203,16 +755,19 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
}
/*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU, or by virtue of having been offline.
+ * Returns positive if the specified CPU has passed through a quiescent state
+ * by virtue of being in or having passed through a dynticks idle state since
+ * the last call to dyntick_save_progress_counter() for this same CPU, or by
+ * virtue of having been offline.
+ *
+ * Returns negative if the specified CPU needs a force resched.
+ *
+ * Returns zero otherwise.
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
unsigned long jtsq;
- bool *rnhqp;
- bool *ruqp;
+ int ret = 0;
struct rcu_node *rnp = rdp->mynode;
/*
@@ -1247,8 +802,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* For more detail, please refer to the "Hotplug CPU" section
* of RCU's Requirements documentation.
*/
- if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) {
- bool onl;
+ if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
struct rcu_node *rnp1;
pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
@@ -1257,9 +811,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
- __func__, rdp->cpu, ".o"[onl],
+ __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
return 1; /* Break things loose after complaining. */
@@ -1277,17 +830,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* is set way high.
*/
jtsq = READ_ONCE(jiffies_to_sched_qs);
- ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
- rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
- if (!READ_ONCE(*rnhqp) &&
+ if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
(time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
time_after(jiffies, rcu_state.jiffies_resched) ||
rcu_state.cbovld)) {
- WRITE_ONCE(*rnhqp, true);
+ WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
- smp_store_release(ruqp, true);
+ smp_store_release(&rdp->rcu_urgent_qs, true);
} else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
- WRITE_ONCE(*ruqp, true);
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
}
/*
@@ -1301,9 +852,9 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (tick_nohz_full_cpu(rdp->cpu) &&
(time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
rcu_state.cbovld)) {
- WRITE_ONCE(*ruqp, true);
- resched_cpu(rdp->cpu);
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
}
/*
@@ -1316,8 +867,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (time_after(jiffies, rcu_state.jiffies_resched)) {
if (time_after(jiffies,
READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
- resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+ ret = -1;
}
if (IS_ENABLED(CONFIG_IRQ_WORK) &&
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
@@ -1326,9 +877,27 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
}
+
+ if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
+ int cpu = rdp->cpu;
+ struct rcu_snap_record *rsrp;
+ struct kernel_cpustat *kcsp;
+
+ kcsp = &kcpustat_cpu(cpu);
+
+ rsrp = &rdp->snap_record;
+ rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+ rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
+ rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
+ rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+ rsrp->jiffies = jiffies;
+ rsrp->gp_seq = rdp->gp_seq;
+ }
}
- return 0;
+ return ret;
}
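The tri-state return documented above is meant to be consumed by the force-quiescent-state scan. The fragment below is illustrative only; the real consumer is force_qs_rnp(), which is not part of this hunk:

/* Illustrative: how a per-CPU scan might consume the tri-state return. */
static void scan_one_rdp(struct rcu_data *rdp, unsigned long *maskp,
			 unsigned long *rsmaskp)
{
	int ret = rcu_implicit_dynticks_qs(rdp);

	if (ret > 0)
		*maskp |= rdp->grpmask;		/* Quiescent state observed. */
	else if (ret < 0)
		*rsmaskp |= rdp->grpmask;	/* Defer resched_cpu() until the
						 * rcu_node lock is dropped. */
}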
/* Trace-event wrapper function for trace_rcu_future_grace_period. */
@@ -1444,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
return needmore;
}
+static void swake_up_one_online_ipi(void *arg)
+{
+ struct swait_queue_head *wqh = arg;
+
+ swake_up_one(wqh);
+}
+
+static void swake_up_one_online(struct swait_queue_head *wqh)
+{
+ int cpu = get_cpu();
+
+ /*
+	 * If called from rcutree_report_cpu_starting(), a wakeup is
+	 * dangerous that late in the CPU-down hotplug process: the
+	 * scheduler might queue an ignored hrtimer. Defer the wakeup
+	 * to an online CPU instead.
+ */
+ if (unlikely(cpu_is_offline(cpu))) {
+ int target;
+
+ target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
+ cpu_online_mask);
+
+ smp_call_function_single(target, swake_up_one_online_ipi,
+ wqh, 0);
+ put_cpu();
+ } else {
+ put_cpu();
+ swake_up_one(wqh);
+ }
+}
+
/*
* Awaken the grace-period kthread. Don't do a self-awaken (unless in an
* interrupt or softirq handler, in which case we just might immediately
@@ -1463,12 +1064,12 @@ static void rcu_gp_kthread_wake(void)
{
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
- if ((current == t && !in_irq() && !in_serving_softirq()) ||
+ if ((current == t && !in_hardirq() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) || !t)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
- swake_up_one(&rcu_state.gp_wq);
+ swake_up_one_online(&rcu_state.gp_wq);
}
/*
@@ -1495,6 +1096,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
if (!rcu_segcblist_pend_cbs(&rdp->cblist))
return false;
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc"));
+
/*
* Callbacks are often registered with incomplete grace-period
* information. Something about the fact that getting exact
@@ -1515,6 +1118,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
else
trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
+
return ret;
}
@@ -1582,10 +1187,11 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
struct rcu_data *rdp)
{
rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
- !raw_spin_trylock_rcu_node(rnp))
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
return;
- WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ // The grace period cannot end while we hold the rcu_node lock.
+ if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
raw_spin_unlock_rcu_node(rnp);
}
@@ -1612,7 +1218,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
bool ret = false;
bool need_qs;
- const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_rdp_is_offloaded(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1650,6 +1256,8 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
+ if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+ WRITE_ONCE(rdp->last_sched_clock, jiffies);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
return ret;
@@ -1676,11 +1284,37 @@ static void note_gp_changes(struct rcu_data *rdp)
rcu_gp_kthread_wake();
}
+static atomic_t *rcu_gp_slow_suppress;
+
+/* Register a counter to suppress debugging grace-period delays. */
+void rcu_gp_slow_register(atomic_t *rgssp)
+{
+ WARN_ON_ONCE(rcu_gp_slow_suppress);
+
+ WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
+
+/* Unregister a counter, with NULL for not caring which. */
+void rcu_gp_slow_unregister(atomic_t *rgssp)
+{
+ WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL);
+
+ WRITE_ONCE(rcu_gp_slow_suppress, NULL);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
+
+static bool rcu_gp_slow_is_suppressed(void)
+{
+ atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
+
+ return rgssp && atomic_read(rgssp);
+}
+
static void rcu_gp_slow(int delay)
{
- if (delay > 0 &&
- !(rcu_seq_ctr(rcu_state.gp_seq) %
- (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
+ if (!rcu_gp_slow_is_suppressed() && delay > 0 &&
+ !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
schedule_timeout_idle(delay);
}
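rcu_gp_slow_register() and rcu_gp_slow_unregister() let a test harness suppress these debugging delays for a while. A minimal sketch, with my_gp_slow_suppress as a hypothetical counter (rcutorture is the intended in-tree user):

static atomic_t my_gp_slow_suppress;

static void my_test_setup(void)
{
	rcu_gp_slow_register(&my_gp_slow_suppress);
	atomic_inc(&my_gp_slow_suppress);	/* Debug GP delays now skipped. */
}

static void my_test_teardown(void)
{
	atomic_dec(&my_gp_slow_suppress);
	rcu_gp_slow_unregister(&my_gp_slow_suppress);
}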
@@ -1718,12 +1352,81 @@ static void rcu_strict_gp_boundary(void *unused)
invoke_rcu_core();
}
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If RCU was idle, note beginning of GP.
+ if (!rcu_seq_state(rcu_state.gp_seq_polled))
+ rcu_seq_start(&rcu_state.gp_seq_polled);
+
+ // Either way, record current state.
+ *snap = rcu_state.gp_seq_polled;
+}
+
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ raw_lockdep_assert_held_rcu_node(rnp);
+
+ // If the previously noted GP is still in effect, record the
+ // end of that GP. Either way, zero counter to avoid counter-wrap
+ // problems.
+ if (*snap && *snap == rcu_state.gp_seq_polled) {
+ rcu_seq_end(&rcu_state.gp_seq_polled);
+ rcu_state.gp_seq_polled_snap = 0;
+ rcu_state.gp_seq_polled_exp_snap = 0;
+ } else {
+ *snap = 0;
+ }
+}
+
+// Make the polled API aware of the beginning of a grace period, but
+// where caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
+{
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ rcu_poll_gp_seq_start(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+// Make the polled API aware of the end of a grace period, but where
+// caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+ unsigned long flags;
+ struct rcu_node *rnp = rcu_get_root();
+
+ if (rcu_init_invoked()) {
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ }
+ rcu_poll_gp_seq_end(snap);
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
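These helpers keep ->gp_seq_polled in step with real grace periods; the locked variants are called from grace-period initialization and cleanup, the unlocked ones from paths that act as a grace period on their own. A hedged sketch of that bracketing, illustrative only:

unsigned long snap;

/* Around a region that is itself a grace period, for example an
 * early-boot or single-CPU fast path. */
rcu_poll_gp_seq_start_unlocked(&snap);
/* ...nothing can be blocking this "grace period"... */
rcu_poll_gp_seq_end_unlocked(&snap);
/* Polled-API cookies taken before this region now report completion. */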
+
/*
* Initialize a new grace period. Return false if no grace period required.
*/
-static bool rcu_gp_init(void)
+static noinline_for_stack bool rcu_gp_init(void)
{
- unsigned long firstseq;
unsigned long flags;
unsigned long oldmask;
unsigned long mask;
@@ -1754,6 +1457,7 @@ static bool rcu_gp_init(void)
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+ rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@@ -1765,21 +1469,18 @@ static bool rcu_gp_init(void)
* go offline later. Please also refer to "Hotplug CPU" section
* of RCU's Requirements documentation.
*/
- rcu_state.gp_state = RCU_GP_ONOFF;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+ /* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
- smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
- firstseq = READ_ONCE(rnp->ofl_seq);
- if (firstseq & 0x1)
- while (firstseq == READ_ONCE(rnp->ofl_seq))
- schedule_timeout_idle(1); // Can't wake unless RCU is watching.
- smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
- raw_spin_lock(&rcu_state.ofl_lock);
- raw_spin_lock_irq_rcu_node(rnp);
+ local_irq_save(flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
+ raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
continue;
}
@@ -1814,8 +1515,9 @@ static bool rcu_gp_init(void)
rcu_cleanup_dead_rnp(rnp);
}
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
}
rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -1831,7 +1533,7 @@ static bool rcu_gp_init(void)
* The grace period cannot complete until the initialization
* process finishes, because this kthread handles both.
*/
- rcu_state.gp_state = RCU_GP_INIT;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
rcu_for_each_node_breadth_first(rnp) {
rcu_gp_slow(gp_init_delay);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -1892,10 +1594,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
*/
static void rcu_gp_fqs(bool first_time)
{
+ int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall);
struct rcu_node *rnp = rcu_get_root();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
- rcu_state.n_force_qs++;
+ WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
+
+ WARN_ON_ONCE(nr_fqs > 3);
+ /* Only countdown nr_fqs for stall purposes if jiffies moves. */
+ if (nr_fqs) {
+ if (nr_fqs == 1) {
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
+ }
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs);
+ }
+
if (first_time) {
/* Collect dyntick-idle snapshots. */
force_qs_rnp(dyntick_save_progress_counter);
@@ -1915,34 +1629,51 @@ static void rcu_gp_fqs(bool first_time)
/*
* Loop doing repeated quiescent-state forcing until the grace period ends.
*/
-static void rcu_gp_fqs_loop(void)
+static noinline_for_stack void rcu_gp_fqs_loop(void)
{
- bool first_gp_fqs;
+ bool first_gp_fqs = true;
int gf = 0;
unsigned long j;
int ret;
struct rcu_node *rnp = rcu_get_root();
- first_gp_fqs = true;
j = READ_ONCE(jiffies_till_first_fqs);
if (rcu_state.cbovld)
gf = RCU_GP_FLAG_OVLD;
ret = 0;
for (;;) {
- if (!ret) {
- rcu_state.jiffies_force_qs = jiffies + j;
+ if (rcu_state.cbovld) {
+ j = (j + 2) / 3;
+ if (j <= 0)
+ j = 1;
+ }
+ if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
+ WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
+ /*
+ * jiffies_force_qs before RCU_GP_WAIT_FQS state
+ * update; required for stall checks.
+ */
+ smp_wmb();
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
jiffies + (j ? 3 * j : 2));
}
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
- rcu_state.gp_state = RCU_GP_WAIT_FQS;
- ret = swait_event_idle_timeout_exclusive(
- rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
+ (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
+ rcu_gp_fqs_check_wake(&gf), j);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DOING_FQS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
- /* If grace period done, leave loop. */
+ /*
+		 * Exit the loop if the root rcu_node structure indicates that the grace
+		 * period has ended. The rcu_preempt_blocked_readers_cgp(rnp) check
+ * is required only for single-node rcu_node trees because readers blocking
+ * the current grace period are queued only on leaf rcu_node structures.
+ * For multi-node trees, checking the root node's ->qsmask suffices, because a
+ * given root node's ->qsmask bit is cleared only when all CPUs and tasks from
+ * the corresponding leaf nodes have passed through their quiescent state.
+ */
if (!READ_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
break;
@@ -1984,7 +1715,7 @@ static void rcu_gp_fqs_loop(void)
/*
* Clean up after the old grace period.
*/
-static void rcu_gp_cleanup(void)
+static noinline void rcu_gp_cleanup(void)
{
int cpu;
bool needgp = false;
@@ -2010,6 +1741,7 @@ static void rcu_gp_cleanup(void)
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
*/
+ rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
/*
@@ -2029,6 +1761,8 @@ static void rcu_gp_cleanup(void)
dump_blkd_tasks(rnp, 10);
WARN_ON_ONCE(rnp->qsmask);
WRITE_ONCE(rnp->gp_seq, new_gp_seq);
+ if (!rnp->parent)
+ smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
rdp = this_cpu_ptr(&rcu_data);
if (rnp == rdp->mynode)
needgp = __note_gp_changes(rnp, rdp) || needgp;
@@ -2054,7 +1788,7 @@ static void rcu_gp_cleanup(void)
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end"));
rcu_seq_end(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
- rcu_state.gp_state = RCU_GP_IDLE;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
/* Check for GP requests since above loop. */
rdp = this_cpu_ptr(&rcu_data);
if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
@@ -2063,16 +1797,31 @@ static void rcu_gp_cleanup(void)
needgp = true;
}
/* Advance CBs to reduce false positives below. */
- offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+ offloaded = rcu_rdp_is_offloaded(rdp);
if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
+
+		// We get here if a grace period was needed ("needgp")
+		// and the above call to rcu_accelerate_cbs() did not set
+		// the RCU_GP_FLAG_INIT bit in ->gp_flags (which records
+		// the need for another grace period).  The purpose
+		// of the "offloaded" check is to avoid invoking
+		// rcu_accelerate_cbs() on an offloaded CPU because we do not
+		// hold the ->nocb_lock needed to safely access an offloaded
+		// ->cblist.  We do not want to acquire that lock because
+		// it can be heavily contended during callback floods.
+
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
- trace_rcu_grace_period(rcu_state.name,
- rcu_state.gp_seq,
- TPS("newreq"));
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
} else {
- WRITE_ONCE(rcu_state.gp_flags,
- rcu_state.gp_flags & RCU_GP_FLAG_INIT);
+
+ // We get here either if there is no need for an
+ // additional grace period or if rcu_accelerate_cbs() has
+		// already set the RCU_GP_FLAG_INIT bit in ->gp_flags.
+ // So all we need to do is to clear all of the other
+ // ->gp_flags bits.
+
+ WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
}
raw_spin_unlock_irq_rcu_node(rnp);
@@ -2093,12 +1842,12 @@ static int __noreturn rcu_gp_kthread(void *unused)
for (;;) {
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwait"));
- rcu_state.gp_state = RCU_GP_WAIT_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
swait_event_idle_exclusive(rcu_state.gp_wq,
READ_ONCE(rcu_state.gp_flags) &
RCU_GP_FLAG_INIT);
rcu_gp_torture_wait();
- rcu_state.gp_state = RCU_GP_DONE_GPS;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
/* Locking provides needed memory barrier. */
if (rcu_gp_init())
break;
@@ -2113,9 +1862,9 @@ static int __noreturn rcu_gp_kthread(void *unused)
rcu_gp_fqs_loop();
/* Handle grace-period end. */
- rcu_state.gp_state = RCU_GP_CLEANUP;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
rcu_gp_cleanup();
- rcu_state.gp_state = RCU_GP_CLEANED;
+ WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
}
}
@@ -2261,8 +2010,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake = false;
- const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+ bool needacc = false;
struct rcu_node *rnp;
WARN_ON_ONCE(rdp->cpu != smp_processor_id());
@@ -2289,15 +2037,33 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
/*
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
+ *
+ * NOCB kthreads have their own way to deal with that...
*/
- if (!offloaded)
- needwake = rcu_accelerate_cbs(rnp, rdp);
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ /*
+ * The current GP has not yet ended, so it
+ * should not be possible for rcu_accelerate_cbs()
+ * to return true. So complain, but don't awaken.
+ */
+ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
+ } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ /*
+ * ...but NOCB kthreads may miss or delay callback acceleration
+ * if in the middle of a (de-)offloading process.
+ */
+ needacc = true;
+ }
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
- if (needwake)
- rcu_gp_kthread_wake();
+
+ if (needacc) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_accelerate_cbs_unlocked(rnp, rdp);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
}
}
@@ -2334,108 +2100,35 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
rcu_report_qs_rdp(rdp);
}
-/*
- * Near the end of the offline process. Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
-{
- bool blkd;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp = rdp->mynode;
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- blkd = !!(rnp->qsmask & rdp->grpmask);
- trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
- blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
- return 0;
-}
-
-/*
- * All CPUs for the specified rcu_node structure have gone offline,
- * and all tasks that were preempted within an RCU read-side critical
- * section while running on one of those CPUs have since exited their RCU
- * read-side critical section. Some other CPU is reporting this fact with
- * the specified rcu_node structure's ->lock held and interrupts disabled.
- * This function therefore goes up the tree of rcu_node structures,
- * clearing the corresponding bits in the ->qsmaskinit fields. Note that
- * the leaf rcu_node structure's ->qsmaskinit field has already been
- * updated.
- *
- * This function does check that the specified rcu_node structure has
- * all CPUs offline and no blocked tasks, so it is OK to invoke it
- * prematurely. That said, invoking it after the fact will cost you
- * a needless lock acquisition. So once it has done its work, don't
- * invoke it again.
- */
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
- long mask;
- struct rcu_node *rnp = rnp_leaf;
-
- raw_lockdep_assert_held_rcu_node(rnp_leaf);
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
- WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
- return;
- for (;;) {
- mask = rnp->grpmask;
- rnp = rnp->parent;
- if (!rnp)
- break;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rnp->qsmaskinit &= ~mask;
- /* Between grace periods, so better already be zero! */
- WARN_ON_ONCE(rnp->qsmask);
- if (rnp->qsmaskinit) {
- raw_spin_unlock_rcu_node(rnp);
- /* irqs remain disabled. */
- return;
- }
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
-}
-
-/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
+/* Return true if callback-invocation time limit exceeded. */
+static bool rcu_do_batch_check_time(long count, long tlimit,
+ bool jlimit_check, unsigned long jlimit)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
-
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
- return 0;
-
- WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
- /* Adjust any no-longer-needed kthreads. */
- rcu_boost_kthread_setaffinity(rnp, -1);
- /* Do any needed no-CB deferred wakeups from this CPU. */
- do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
-
- // Stop-machine done, so allow nohz_full to disable tick.
- tick_dep_clear(TICK_DEP_BIT_RCU);
- return 0;
+ // Invoke local_clock() only once per 32 consecutive callbacks.
+ return unlikely(tlimit) &&
+ (!likely(count & 31) ||
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) &&
+ jlimit_check && time_after(jiffies, jlimit))) &&
+ local_clock() >= tlimit;
}
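The helper above reads the fine-grained clock only on every 32nd callback (or when the optional jiffies-based double check fires), which keeps local_clock() off the common path. A minimal user-space sketch of the same pattern, with now_ns() and the 32-callback stride as illustrative assumptions rather than kernel API:

    #include <stdbool.h>
    #include <time.h>

    static unsigned long long now_ns(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    }

    static bool over_time_limit(long count, unsigned long long tlimit)
    {
            /* Skip the expensive clock read on 31 of every 32 callbacks. */
            if (count & 31)
                    return false;
            return tlimit && now_ns() >= tlimit;
    }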
/*
* Invoke any RCU callbacks that have made it to the end of their grace
- * period. Thottle as specified by rdp->blimit.
+ * period. Throttle as specified by rdp->blimit.
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
+ long bl;
+ long count = 0;
int div;
+ bool __maybe_unused empty;
unsigned long flags;
- const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
- struct rcu_head *rhp;
+ unsigned long jlimit;
+ bool jlimit_check = false;
+ long pending;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
- long bl, count;
- long pending, tlimit = 0;
+ struct rcu_head *rhp;
+ long tlimit = 0;
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
@@ -2444,47 +2137,55 @@ static void rcu_do_batch(struct rcu_data *rdp)
trace_rcu_batch_end(rcu_state.name, 0,
!rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
- rcu_is_callbacks_kthread());
+ rcu_is_callbacks_kthread(rdp));
return;
}
/*
- * Extract the list of ready callbacks, disabling to prevent
+ * Extract the list of ready callbacks, disabling IRQs to prevent
* races with call_rcu() from interrupt handlers. Leave the
* callback counts, as rcu_barrier() needs to be conservative.
*/
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
+ rcu_nocb_lock_irqsave(rdp, flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- pending = rcu_segcblist_n_cbs(&rdp->cblist);
+ pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL);
div = READ_ONCE(rcu_divisor);
div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
bl = max(rdp->blimit, pending >> div);
- if (unlikely(bl > 100)) {
+ if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) &&
+ (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) {
+ const long npj = NSEC_PER_SEC / HZ;
long rrn = READ_ONCE(rcu_resched_ns);
rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
tlimit = local_clock() + rrn;
+ jlimit = jiffies + (rrn + npj + 1) / npj;
+ jlimit_check = true;
}
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
- if (offloaded)
+ if (rcu_rdp_is_offloaded(rdp))
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued"));
rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
tick_dep_set_task(current, TICK_DEP_BIT_RCU);
rhp = rcu_cblist_dequeue(&rcl);
+
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
rcu_callback_t f;
+ count++;
debug_rcu_head_unqueue(rhp);
rcu_lock_acquire(&rcu_callback_map);
trace_rcu_invoke_callback(rcu_state.name, rhp);
f = rhp->func;
+ debug_rcu_head_callback(rhp);
WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
f(rhp);
@@ -2492,40 +2193,42 @@ static void rcu_do_batch(struct rcu_data *rdp)
/*
* Stop only if limit reached and CPU has something to do.
- * Note: The rcl structure counts down from zero.
*/
- if (-rcl.len >= bl && !offloaded &&
- (need_resched() ||
- (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
- break;
- if (unlikely(tlimit)) {
- /* only call local_clock() every 32 callbacks */
- if (likely((-rcl.len & 31) || local_clock() < tlimit))
- continue;
- /* Exceeded the time limit, so leave. */
- break;
- }
- if (offloaded) {
- WARN_ON_ONCE(in_serving_softirq());
+ if (in_serving_softirq()) {
+ if (count >= bl && (need_resched() || !is_idle_task(current)))
+ break;
+ /*
+ * Make sure we don't spend too much time here and deprive other
+ * softirq vectors of CPU cycles.
+ */
+ if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit))
+ break;
+ } else {
+ // In rcuc/rcuoc context, so no worries about
+ // depriving other softirq vectors of CPU cycles.
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
lockdep_assert_irqs_enabled();
local_bh_disable();
+ // But rcuc kthreads can delay quiescent-state
+ // reporting, so check time limits for them.
+ if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING &&
+ rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) {
+ rdp->rcu_cpu_has_work = 1;
+ break;
+ }
}
}
- local_irq_save(flags);
- rcu_nocb_lock(rdp);
- count = -rcl.len;
+ rcu_nocb_lock_irqsave(rdp, flags);
rdp->n_cbs_invoked += count;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
- is_idle_task(current), rcu_is_callbacks_kthread());
+ is_idle_task(current), rcu_is_callbacks_kthread(rdp));
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
- smp_mb(); /* List handling before counting for rcu_barrier(). */
- rcu_segcblist_insert_count(&rdp->cblist, &rcl);
+ rcu_segcblist_add_len(&rdp->cblist, -count);
/* Reinstate batch limit if we have worked down the excess. */
count = rcu_segcblist_n_cbs(&rdp->cblist);
@@ -2535,7 +2238,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
} else if (count < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = count;
@@ -2543,15 +2246,15 @@ static void rcu_do_batch(struct rcu_data *rdp)
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
- WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
+ empty = rcu_segcblist_empty(&rdp->cblist);
+ WARN_ON_ONCE(count == 0 && !empty);
WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- count != 0 && rcu_segcblist_empty(&rdp->cblist));
+ count != 0 && empty);
+ WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0);
+ WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0);
rcu_nocb_unlock_irqrestore(rdp, flags);
- /* Re-invoke RCU core processing if there are callbacks remaining. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
- invoke_rcu_core();
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
}
@@ -2561,11 +2264,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
* state, for example, user mode or idle loop. It also schedules RCU
* core processing. If the current grace period has gone on too long,
* it will ask the scheduler to manufacture a context switch for the sole
- * purpose of providing a providing the needed quiescent state.
+ * purpose of providing the needed quiescent state.
*/
void rcu_sched_clock_irq(int user)
{
+ unsigned long j;
+
+ if (IS_ENABLED(CONFIG_PROVE_RCU)) {
+ j = jiffies;
+ WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
+ __this_cpu_write(rcu_data.last_sched_clock, j);
+ }
trace_rcu_utilization(TPS("Start scheduler-tick"));
+ lockdep_assert_irqs_disabled();
raw_cpu_inc(rcu_data.ticks_this_gp);
/* The load-acquire pairs with the store-release setting to true. */
if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
@@ -2579,6 +2290,9 @@ void rcu_sched_clock_irq(int user)
rcu_flavor_sched_clock_irq(user);
if (rcu_pending(user))
invoke_rcu_core();
+ if (user || rcu_is_cpu_rrupt_from_idle())
+ rcu_note_voluntary_context_switch(current);
+ lockdep_assert_irqs_disabled();
trace_rcu_utilization(TPS("End scheduler-tick"));
}
@@ -2594,15 +2308,15 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
{
int cpu;
unsigned long flags;
- unsigned long mask;
- struct rcu_data *rdp;
struct rcu_node *rnp;
rcu_state.cbovld = rcu_state.cbovldnext;
rcu_state.cbovldnext = false;
rcu_for_each_leaf_node(rnp) {
+ unsigned long mask = 0;
+ unsigned long rsmask = 0;
+
cond_resched_tasks_rcu_qs();
- mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rcu_state.cbovldnext |= !!rnp->cbovldmask;
if (rnp->qsmask == 0) {
@@ -2620,11 +2334,17 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
continue;
}
for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+ struct rcu_data *rdp;
+ int ret;
+
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (f(rdp)) {
+ ret = f(rdp);
+ if (ret > 0) {
mask |= rdp->grpmask;
rcu_disable_urgency_upon_qs(rdp);
}
+ if (ret < 0)
+ rsmask |= rdp->grpmask;
}
if (mask != 0) {
/* Idle/offline CPUs, report (releases rnp->lock). */
@@ -2633,6 +2353,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
/* Nothing to do here, so just drop the lock. */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+
+ for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
+ resched_cpu(cpu);
}
}
@@ -2647,8 +2370,10 @@ void rcu_force_quiescent_state(void)
struct rcu_node *rnp;
struct rcu_node *rnp_old = NULL;
+ if (!rcu_gp_in_progress())
+ return;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = __this_cpu_read(rcu_data.mynode);
+ rnp = raw_cpu_read(rcu_data.mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
@@ -2688,7 +2413,24 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
+ /*
+ * On RT rcu_core() can be preempted when IRQs aren't disabled.
+ * Therefore this function can race with concurrent NOCB (de-)offloading
+ * on this CPU and the below condition must be considered volatile.
+ * However if we race with:
+ *
+ * _ Offloading: In the worst case we accelerate or process callbacks
+ * concurrently with NOCB kthreads. We are guaranteed to
+ * call rcu_nocb_lock() if that happens.
+ *
+ * _ Deoffloading: In the worst case we miss callbacks acceleration or
+ * processing. This is fine because the early stage
+ * of deoffloading invokes rcu_core() after setting
+ * SEGCBLIST_RCU_CORE. So we guarantee that we'll process
+ * what could have been dismissed without the need to wait
+ * for the next rcu_pending() check in the next jiffy.
+ */
+ const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2696,7 +2438,7 @@ static __latent_entropy void rcu_core(void)
WARN_ON_ONCE(!rdp->beenonline);
/* Report any deferred quiescent states if preemption enabled. */
- if (!(preempt_count() & PREEMPT_MASK)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
rcu_preempt_deferred_qs(current);
} else if (rcu_preempt_need_deferred_qs(current)) {
set_tsk_need_resched(current);
@@ -2708,19 +2450,23 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
- local_irq_save(flags);
+ rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) {
+ rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
- local_irq_restore(flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
- likely(READ_ONCE(rcu_scheduler_fully_active)))
+ if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ likely(READ_ONCE(rcu_scheduler_fully_active))) {
rcu_do_batch(rdp);
+ /* Re-invoke RCU core processing if there are callbacks remaining. */
+ if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ invoke_rcu_core();
+ }
/* Do any needed deferred wakeups of rcuo kthreads. */
do_nocb_deferred_wakeup(rdp);
@@ -2791,20 +2537,22 @@ static void rcu_cpu_kthread(unsigned int cpu)
{
unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
int spincnt;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
for (spincnt = 0; spincnt < 10; spincnt++) {
+ WRITE_ONCE(*j, jiffies);
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
work = *workp;
- *workp = 0;
+ WRITE_ONCE(*workp, 0);
local_irq_enable();
if (work)
rcu_core();
local_bh_enable();
- if (*workp == 0) {
+ if (!READ_ONCE(*workp)) {
trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
*statusp = RCU_KTHREAD_WAITING;
return;
@@ -2815,6 +2563,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
schedule_timeout_idle(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
+ WRITE_ONCE(*j, jiffies);
}
static struct smp_hotplug_thread rcu_cpu_thread_spec = {
@@ -2835,13 +2584,12 @@ static int __init rcu_spawn_core_kthreads(void)
for_each_possible_cpu(cpu)
per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
- if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+ if (use_softirq)
return 0;
WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
return 0;
}
-early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
@@ -2879,10 +2627,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
} else {
/* Give the grace period a kick. */
rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
- if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
+ if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
rcu_force_quiescent_state();
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
@@ -2937,11 +2685,12 @@ static void check_cb_ovld(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp);
}
-/* Helper function for call_rcu() and friends. */
static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func)
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
{
+ static atomic_t doublefrees;
unsigned long flags;
+ bool lazy;
struct rcu_data *rdp;
bool was_alldone;
@@ -2952,18 +2701,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
/*
* Probable double call_rcu(), so leak the callback.
* Use rcu:rcu_callback trace event to find the previous
- * time callback was passed to __call_rcu().
+ * time callback was passed to call_rcu().
*/
- WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
- head, head->func);
+ if (atomic_inc_return(&doublefrees) < 4) {
+ pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
+ mem_dump_obj(head);
+ }
WRITE_ONCE(head->func, rcu_leak_callback);
return;
}
head->func = func;
head->next = NULL;
+ kasan_record_aux_stack_noalloc(head);
local_irq_save(flags);
- kasan_record_aux_stack(head);
rdp = this_cpu_ptr(&rcu_data);
+ lazy = lazy_in && !rcu_async_should_hurry();
/* Add the callback to our list. */
if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
@@ -2977,7 +2729,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
}
check_cb_ovld(rdp);
- if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
return; // Enqueued onto ->nocb_bypass, so just leave.
// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head);
@@ -2989,8 +2741,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
trace_rcu_callback(rcu_state.name, head,
rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
+
/* Go handle any RCU core processing required. */
- if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
+ if (unlikely(rcu_rdp_is_offloaded(rdp))) {
__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
} else {
__call_rcu_core(rdp, head, flags);
@@ -2998,8 +2752,40 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
}
}
+#ifdef CONFIG_RCU_LAZY
+/**
+ * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
+ * flush all lazy callbacks (including the new one) to the main ->cblist while
+ * doing so.
+ *
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.
+ *
+ * Use this API instead of call_rcu() if you don't want the callback to be
+ * invoked after very long periods of time, which can happen on systems without
+ * memory pressure and on systems which are lightly loaded or mostly idle.
+ * This function will cause callbacks to be invoked sooner than later at the
+ * expense of extra power. Other than that, this function is identical to, and
+ * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
+ * ordering and other functionality.
+ */
+void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
+{
+ __call_rcu_common(head, func, false);
+}
+EXPORT_SYMBOL_GPL(call_rcu_hurry);
+#endif
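Assuming a caller that embeds a struct rcu_head in its own data, a minimal sketch of using call_rcu_hurry() might look as follows; struct foo and its helpers are hypothetical names, not part of this patch:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
            int data;
            struct rcu_head rh;
    };

    static void foo_free_cb(struct rcu_head *rhp)
    {
            kfree(container_of(rhp, struct foo, rh));
    }

    static void foo_retire(struct foo *fp)
    {
            /* Request prompt (non-lazy) processing of this callback. */
            call_rcu_hurry(&fp->rh, foo_free_cb);
    }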
+
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * By default the callbacks are 'lazy' and are kept hidden from the main
+ * ->cblist to prevent starting of grace periods too soon.
+ * If you desire grace periods to start very soon, use call_rcu_hurry().
+ *
* @head: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
@@ -3007,12 +2793,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked. RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and
- * may be nested. In addition, regions of code across which interrupts,
- * preemption, or softirqs have been disabled also serve as RCU read-side
- * critical sections. This includes hardware interrupt handlers, softirq
- * handlers, and NMI handlers.
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
*
* Note that all CPUs must agree that the grace period extended beyond
* all pre-existing RCU read-side critical section. On systems with more
@@ -3032,28 +2820,32 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func);
+ __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY));
}
EXPORT_SYMBOL_GPL(call_rcu);
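For contrast with the lazy default described in the kernel-doc above, here is a sketch of the classic call_rcu() update pattern: publish a new version with rcu_assign_pointer(), then defer freeing the old one until readers are done. All names (struct config, cur_config, config_lock) are hypothetical:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct config {
            int value;
            struct rcu_head rh;
    };

    static struct config __rcu *cur_config;
    static DEFINE_SPINLOCK(config_lock);

    static void config_free_cb(struct rcu_head *rhp)
    {
            kfree(container_of(rhp, struct config, rh));
    }

    static void config_update(int new_value)
    {
            struct config *newc = kzalloc(sizeof(*newc), GFP_KERNEL);
            struct config *oldc;

            if (!newc)
                    return;
            newc->value = new_value;

            spin_lock(&config_lock);
            oldc = rcu_dereference_protected(cur_config,
                                             lockdep_is_held(&config_lock));
            rcu_assign_pointer(cur_config, newc);
            spin_unlock(&config_lock);

            if (oldc)
                    call_rcu(&oldc->rh, config_free_cb);    /* may be lazy */
    }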
-
/* Maximum number of jiffies to wait before draining a batch. */
-#define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
#define FREE_N_CHANNELS 2
/**
* struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
* @nr_records: Number of active pointers in the array
- * @next: Next bulk object in the block chain
* @records: Array of the kvfree_rcu() pointers
*/
struct kvfree_rcu_bulk_data {
+ struct list_head list;
+ struct rcu_gp_oldstate gp_snap;
unsigned long nr_records;
- struct kvfree_rcu_bulk_data *next;
void *records[];
};
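Each bulk block is a single page whose tail serves as the pointer array, so its capacity is roughly the page size minus the header, divided by the pointer size (the actual limit used in this file is KVFREE_BULK_MAX_ENTR). A stand-alone, illustrative-only sketch of that arithmetic, with PAGE_SIZE and the header layout assumed for the example:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL                /* assumption for the example */

    struct bulk_hdr {                       /* stand-in for the block header above */
            void *list_prev, *list_next;    /* struct list_head */
            unsigned long gp_snap[2];       /* struct rcu_gp_oldstate */
            unsigned long nr_records;
    };

    int main(void)
    {
            unsigned long max = (PAGE_SIZE - sizeof(struct bulk_hdr)) / sizeof(void *);

            printf("~%lu pointers per bulk page\n", max);
            return 0;
    }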
@@ -3069,33 +2861,37 @@ struct kvfree_rcu_bulk_data {
* struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
* @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
* @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
* @krcp: Pointer to @kfree_rcu_cpu structure
*/
struct kfree_rcu_cpu_work {
struct rcu_work rcu_work;
struct rcu_head *head_free;
- struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
+ struct rcu_gp_oldstate head_free_gp_snap;
+ struct list_head bulk_head_free[FREE_N_CHANNELS];
struct kfree_rcu_cpu *krcp;
};
/**
* struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
* @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
* @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
* @lock: Synchronize access to this structure
* @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
* @initialized: The @rcu_work fields have been initialized
- * @count: Number of objects for which GP not started
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
* @bkvcache:
* A simple cache list that contains objects for reuse purpose.
* In order to save some per-cpu space the list is singular.
* Even though it is lockless an access has to be protected by the
* per-cpu lock.
* @page_cache_work: A work to refill the cache when it is empty
+ * @backoff_page_cache_fill: Delay cache refills
* @work_in_progress: Indicates that page_cache_work is running
* @hrtimer: A hrtimer for scheduling a page_cache_work
* @nr_bkv_objs: number of allocated objects at @bkvcache.
@@ -3106,16 +2902,23 @@ struct kfree_rcu_cpu_work {
* the interactions with the slab allocators.
*/
struct kfree_rcu_cpu {
+ // Objects queued on a linked list
+ // through their rcu_head structures.
struct rcu_head *head;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
+ unsigned long head_gp_snap;
+ atomic_t head_count;
+
+ // Objects queued on a bulk-list.
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ atomic_t bulk_count[FREE_N_CHANNELS];
+
struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
raw_spinlock_t lock;
struct delayed_work monitor_work;
- bool monitor_todo;
bool initialized;
- int count;
- struct work_struct page_cache_work;
+ struct delayed_work page_cache_work;
+ atomic_t backoff_page_cache_fill;
atomic_t work_in_progress;
struct hrtimer hrtimer;
@@ -3153,8 +2956,7 @@ krc_this_cpu_lock(unsigned long *flags)
static inline void
krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
{
- raw_spin_unlock(&krcp->lock);
- local_irq_restore(flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static inline struct kvfree_rcu_bulk_data *
@@ -3163,7 +2965,7 @@ get_cached_bnode(struct kfree_rcu_cpu *krcp)
if (!krcp->nr_bkv_objs)
return NULL;
- krcp->nr_bkv_objs--;
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
return (struct kvfree_rcu_bulk_data *)
llist_del_first(&krcp->bkvcache);
}
@@ -3177,85 +2979,79 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp,
return false;
llist_add((struct llist_node *) bnode, &krcp->bkvcache);
- krcp->nr_bkv_objs++;
+ WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
return true;
-
}
-/*
- * This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bhead_free or ->head_free.
- */
-static void kfree_rcu_work(struct work_struct *work)
+static int
+drain_page_cache(struct kfree_rcu_cpu *krcp)
{
unsigned long flags;
- struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
- struct rcu_head *head, *next;
- struct kfree_rcu_cpu *krcp;
- struct kfree_rcu_cpu_work *krwp;
- int i, j;
+ struct llist_node *page_list, *pos, *n;
+ int freed = 0;
- krwp = container_of(to_rcu_work(work),
- struct kfree_rcu_cpu_work, rcu_work);
- krcp = krwp->krcp;
+ if (!rcu_min_cached_objs)
+ return 0;
raw_spin_lock_irqsave(&krcp->lock, flags);
- // Channels 1 and 2.
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- bkvhead[i] = krwp->bkvhead_free[i];
- krwp->bkvhead_free[i] = NULL;
- }
-
- // Channel 3.
- head = krwp->head_free;
- krwp->head_free = NULL;
+ page_list = llist_del_all(&krcp->bkvcache);
+ WRITE_ONCE(krcp->nr_bkv_objs, 0);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
- // Handle two first channels.
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- for (; bkvhead[i]; bkvhead[i] = bnext) {
- bnext = bkvhead[i]->next;
- debug_rcu_bhead_unqueue(bkvhead[i]);
-
- rcu_lock_acquire(&rcu_callback_map);
- if (i == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bkvhead[i]->nr_records,
- bkvhead[i]->records);
-
- kfree_bulk(bkvhead[i]->nr_records,
- bkvhead[i]->records);
- } else { // vmalloc() / vfree().
- for (j = 0; j < bkvhead[i]->nr_records; j++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name,
- bkvhead[i]->records[j], 0);
-
- vfree(bkvhead[i]->records[j]);
- }
- }
- rcu_lock_release(&rcu_callback_map);
+ llist_for_each_safe(pos, n, page_list) {
+ free_page((unsigned long)pos);
+ freed++;
+ }
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (put_cached_bnode(krcp, bkvhead[i]))
- bkvhead[i] = NULL;
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ return freed;
+}
- if (bkvhead[i])
- free_page((unsigned long) bkvhead[i]);
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+ struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+ unsigned long flags;
+ int i;
- cond_resched_tasks_rcu_qs();
+ if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
+ debug_rcu_bhead_unqueue(bnode);
+ rcu_lock_acquire(&rcu_callback_map);
+ if (idx == 0) { // kmalloc() / kfree().
+ trace_rcu_invoke_kfree_bulk_callback(
+ rcu_state.name, bnode->nr_records,
+ bnode->records);
+
+ kfree_bulk(bnode->nr_records, bnode->records);
+ } else { // vmalloc() / vfree().
+ for (i = 0; i < bnode->nr_records; i++) {
+ trace_rcu_invoke_kvfree_callback(
+ rcu_state.name, bnode->records[i], 0);
+
+ vfree(bnode->records[i]);
+ }
}
+ rcu_lock_release(&rcu_callback_map);
}
- /*
- * Emergency case only. It can happen under low memory
- * condition when an allocation gets failed, so the "bulk"
- * path can not be temporary maintained.
- */
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ if (put_cached_bnode(krcp, bnode))
+ bnode = NULL;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (bnode)
+ free_page((unsigned long) bnode);
+
+ cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+ struct rcu_head *next;
+
for (; head; head = next) {
- unsigned long offset = (unsigned long)head->func;
- void *ptr = (void *)head - offset;
+ void *ptr = (void *) head->func;
+ unsigned long offset = (void *) head - ptr;
next = head->next;
debug_rcu_head_unqueue((struct rcu_head *)ptr);
@@ -3271,99 +3067,205 @@ static void kfree_rcu_work(struct work_struct *work)
}
/*
- * Schedule the kfree batch RCU work to run in workqueue context after a GP.
- *
- * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
- * timeout has been reached.
+ * This function is invoked in workqueue context after a grace period.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
*/
-static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+static void kfree_rcu_work(struct work_struct *work)
{
+ unsigned long flags;
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct list_head bulk_head[FREE_N_CHANNELS];
+ struct rcu_head *head;
+ struct kfree_rcu_cpu *krcp;
struct kfree_rcu_cpu_work *krwp;
- bool repeat = false;
- int i, j;
+ struct rcu_gp_oldstate head_gp_snap;
+ int i;
- lockdep_assert_held(&krcp->lock);
+ krwp = container_of(to_rcu_work(work),
+ struct kfree_rcu_cpu_work, rcu_work);
+ krcp = krwp->krcp;
- for (i = 0; i < KFREE_N_BATCHES; i++) {
- krwp = &(krcp->krw_arr[i]);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ // Channels 1 and 2.
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
- /*
- * Try to detach bkvhead or head and attach it over any
- * available corresponding free channel. It can be that
- * a previous RCU batch is in progress, it means that
- * immediately to queue another one is not possible so
- * return false to tell caller to retry.
- */
- if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
- (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
- (krcp->head && !krwp->head_free)) {
- // Channel 1 corresponds to SLAB ptrs.
- // Channel 2 corresponds to vmalloc ptrs.
- for (j = 0; j < FREE_N_CHANNELS; j++) {
- if (!krwp->bkvhead_free[j]) {
- krwp->bkvhead_free[j] = krcp->bkvhead[j];
- krcp->bkvhead[j] = NULL;
- }
- }
+ // Channel 3.
+ head = krwp->head_free;
+ krwp->head_free = NULL;
+ head_gp_snap = krwp->head_free_gp_snap;
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
- // Channel 3 corresponds to emergency path.
- if (!krwp->head_free) {
- krwp->head_free = krcp->head;
- krcp->head = NULL;
- }
+ // Handle the first two channels.
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ // Start from the tail page, for which a grace period has most likely already elapsed.
+ list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
- WRITE_ONCE(krcp->count, 0);
+ /*
+ * This is used when the "bulk" path cannot be used for the
+ * double-argument variant of kvfree_rcu(). This happens when the
+ * page-cache is empty, which means that objects are instead
+ * queued on a linked list through their rcu_head structures.
+ * This list is named "Channel 3".
+ */
+ if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
+ kvfree_rcu_list(head);
+}
- /*
- * One work is per one batch, so there are three
- * "free channels", the batch can handle. It can
- * be that the work is in the pending state when
- * channels have been detached following by each
- * other.
- */
- queue_rcu_work(system_wq, &krwp->rcu_work);
- }
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+ int i;
- // Repeat if any "free" corresponding channel is still busy.
- if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
- repeat = true;
- }
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krcp->bulk_head[i]))
+ return true;
- return !repeat;
+ return !!READ_ONCE(krcp->head);
}
-static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
- unsigned long flags)
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
{
- // Attempt to start a new batch.
- krcp->monitor_todo = false;
- if (queue_kfree_rcu_work(krcp)) {
- // Success! Our job is done here.
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krwp->bulk_head_free[i]))
+ return true;
+
+ return !!krwp->head_free;
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+ int sum = atomic_read(&krcp->head_count);
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ sum += atomic_read(&krcp->bulk_count[i]);
+
+ return sum;
+}
+
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+ long delay, delay_left;
+
+ delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+ if (delayed_work_pending(&krcp->monitor_work)) {
+ delay_left = krcp->monitor_work.timer.expires - jiffies;
+ if (delay < delay_left)
+ mod_delayed_work(system_wq, &krcp->monitor_work, delay);
return;
}
+ queue_delayed_work(system_wq, &krcp->monitor_work, delay);
+}
- // Previous RCU batch still in progress, try again later.
- krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+ struct list_head bulk_ready[FREE_N_CHANNELS];
+ struct kvfree_rcu_bulk_data *bnode, *n;
+ struct rcu_head *head_ready = NULL;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ INIT_LIST_HEAD(&bulk_ready[i]);
+
+ list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+ if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
+ break;
+
+ atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+ list_move(&bnode->list, &bulk_ready[i]);
+ }
+ }
+
+ if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+ head_ready = krcp->head;
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ for (i = 0; i < FREE_N_CHANNELS; i++) {
+ list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+ kvfree_rcu_bulk(krcp, bnode, i);
+ }
+
+ if (head_ready)
+ kvfree_rcu_list(head_ready);
}
/*
* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
- * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
*/
static void kfree_rcu_monitor(struct work_struct *work)
{
+ struct kfree_rcu_cpu *krcp = container_of(work,
+ struct kfree_rcu_cpu, monitor_work.work);
unsigned long flags;
- struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
- monitor_work.work);
+ int i, j;
+
+ // Drain ready for reclaim.
+ kvfree_rcu_drain_ready(krcp);
raw_spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // Attempt to start a new batch.
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
+
+ // Try to detach bulk_head or head and attach it, only when
+ // all channels are free. If any channel is not free, krwp
+ // still has in-flight RCU work handling its previous batch.
+ if (need_wait_for_krwp_work(krwp))
+ continue;
+
+ // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+ if (need_offload_krc(krcp)) {
+ // Channel 1 corresponds to the SLAB-pointer bulk path.
+ // Channel 2 corresponds to vmalloc-pointer bulk path.
+ for (j = 0; j < FREE_N_CHANNELS; j++) {
+ if (list_empty(&krwp->bulk_head_free[j])) {
+ atomic_set(&krcp->bulk_count[j], 0);
+ list_replace_init(&krcp->bulk_head[j],
+ &krwp->bulk_head_free[j]);
+ }
+ }
+
+ // Channel 3 corresponds to both SLAB and vmalloc
+ // objects queued on the linked list.
+ if (!krwp->head_free) {
+ krwp->head_free = krcp->head;
+ get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
+ atomic_set(&krcp->head_count, 0);
+ WRITE_ONCE(krcp->head, NULL);
+ }
+
+ // There is one work item per batch, and each batch
+ // can handle three "free channels". The work may
+ // already be in the pending state if the channels
+ // were detached one after the other.
+ queue_rcu_work(system_wq, &krwp->rcu_work);
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ // If there is nothing to detach, our job is done here.
+ // If at least one channel is still busy, rearm the work
+ // to retry, because previous batches are still in progress.
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
}
static enum hrtimer_restart
@@ -3372,7 +3274,7 @@ schedule_page_work_fn(struct hrtimer *t)
struct kfree_rcu_cpu *krcp =
container_of(t, struct kfree_rcu_cpu, hrtimer);
- queue_work(system_highpri_wq, &krcp->page_cache_work);
+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
return HRTIMER_NORESTART;
}
@@ -3381,111 +3283,143 @@ static void fill_page_cache_func(struct work_struct *work)
struct kvfree_rcu_bulk_data *bnode;
struct kfree_rcu_cpu *krcp =
container_of(work, struct kfree_rcu_cpu,
- page_cache_work);
+ page_cache_work.work);
unsigned long flags;
+ int nr_pages;
bool pushed;
int i;
- for (i = 0; i < rcu_min_cached_objs; i++) {
+ nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
+ 1 : rcu_min_cached_objs;
+
+ for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_KERNEL | __GFP_NOWARN);
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- if (bnode) {
- raw_spin_lock_irqsave(&krcp->lock, flags);
- pushed = put_cached_bnode(krcp, bnode);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ if (!bnode)
+ break;
- if (!pushed) {
- free_page((unsigned long) bnode);
- break;
- }
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
}
}
atomic_set(&krcp->work_in_progress, 0);
+ atomic_set(&krcp->backoff_page_cache_fill, 0);
}
static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
+ // If cache disabled, bail out.
+ if (!rcu_min_cached_objs)
+ return;
+
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
- hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ if (atomic_read(&krcp->backoff_page_cache_fill)) {
+ queue_delayed_work(system_wq,
+ &krcp->page_cache_work,
+ msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
+ } else {
+ hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
}
}
+// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
+// state specified by flags. If can_alloc is true, the caller must
+// be schedulable and not be holding any locks or mutexes that might be
+// acquired by the memory allocator or anything that it might invoke.
+// Returns true if ptr was successfully recorded, else the caller must
+// use a fallback.
static inline bool
-kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
+add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
+ unsigned long *flags, void *ptr, bool can_alloc)
{
struct kvfree_rcu_bulk_data *bnode;
int idx;
- if (unlikely(!krcp->initialized))
+ *krcp = krc_this_cpu_lock(flags);
+ if (unlikely(!(*krcp)->initialized))
return false;
- lockdep_assert_held(&krcp->lock);
idx = !!is_vmalloc_addr(ptr);
+ bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+ struct kvfree_rcu_bulk_data, list);
/* Check if a new block is required. */
- if (!krcp->bkvhead[idx] ||
- krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
- bnode = get_cached_bnode(krcp);
- /* Switch to emergency path. */
+ if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
+ bnode = get_cached_bnode(*krcp);
+ if (!bnode && can_alloc) {
+ krc_this_cpu_unlock(*krcp, *flags);
+
+ // __GFP_NORETRY - allows a light-weight direct reclaim,
+ // which is acceptable because it minimizes how often the
+ // fallback path is hit. It also forbids any OOM invocation,
+ // which is beneficial since we are about to release memory soon.
+ //
+ // __GFP_NOMEMALLOC - prevents consuming all of the memory
+ // reserves. Please note we have a fallback path.
+ //
+ // __GFP_NOWARN - the allocation is expected to fail under
+ // low-memory or high memory-pressure scenarios, so do not
+ // warn about it.
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
+ }
+
if (!bnode)
return false;
- /* Initialize the new block. */
+ // Initialize the new block and attach it.
bnode->nr_records = 0;
- bnode->next = krcp->bkvhead[idx];
-
- /* Attach it to the head. */
- krcp->bkvhead[idx] = bnode;
+ list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
}
- /* Finally insert. */
- krcp->bkvhead[idx]->records
- [krcp->bkvhead[idx]->nr_records++] = ptr;
+ // Finally insert and update the GP for this page.
+ bnode->records[bnode->nr_records++] = ptr;
+ get_state_synchronize_rcu_full(&bnode->gp_snap);
+ atomic_inc(&(*krcp)->bulk_count[idx]);
return true;
}
/*
- * Queue a request for lazy invocation of appropriate free routine after a
- * grace period. Please note there are three paths are maintained, two are the
- * main ones that use array of pointers interface and third one is emergency
- * one, that is used only when the main path can not be maintained temporary,
- * due to memory pressure.
+ * Queue a request for lazy invocation of the appropriate free routine
+ * after a grace period. Please note that three paths are maintained,
+ * two for the common case using arrays of pointers and a third one that
+ * is used only when the main paths cannot be used, for example, due to
+ * memory pressure.
*
* Each kvfree_call_rcu() request is added to a batch. The batch will be drained
* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
* be free'd in workqueue context. This allows us to: batch requests together to
* reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
*/
-void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
unsigned long flags;
struct kfree_rcu_cpu *krcp;
bool success;
- void *ptr;
- if (head) {
- ptr = (void *) head - (unsigned long) func;
- } else {
- /*
- * Please note there is a limitation for the head-less
- * variant, that is why there is a clear rule for such
- * objects: it can be used from might_sleep() context
- * only. For other places please embed an rcu_head to
- * your data.
- */
+ /*
+ * Please note there is a limitation for the head-less
+ * variant: such objects may be used only from contexts
+ * where might_sleep() is legal. For other places please
+ * embed an rcu_head into your data.
+ */
+ if (!head)
might_sleep();
- ptr = (unsigned long *) func;
- }
-
- krcp = krc_this_cpu_lock(&flags);
// Queue the object but don't yet schedule the batch.
if (debug_rcu_head_queue(ptr)) {
@@ -3494,11 +3428,11 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
__func__, head);
// Mark as success and leave.
- success = true;
- goto unlock_return;
+ return;
}
- success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+ kasan_record_aux_stack_noalloc(ptr);
+ success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
if (!success) {
run_page_cache_worker(krcp);
@@ -3506,20 +3440,27 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
// Inline if kvfree_rcu(one_arg) call.
goto unlock_return;
- head->func = func;
+ head->func = ptr;
head->next = krcp->head;
- krcp->head = head;
+ WRITE_ONCE(krcp->head, head);
+ atomic_inc(&krcp->head_count);
+
+ // Take a snapshot for this krcp.
+ krcp->head_gp_snap = get_state_synchronize_rcu();
success = true;
}
- WRITE_ONCE(krcp->count, krcp->count + 1);
+ /*
+ * The kvfree_rcu() caller considers the pointer freed at this point
+ * and likely removes any references to it. Since the actual slab
+ * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
+ * this object (no scanning or false-positive reporting).
+ */
+ kmemleak_ignore(ptr);
// Set timer to drain after KFREE_DRAIN_JIFFIES.
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
- !krcp->monitor_todo) {
- krcp->monitor_todo = true;
- schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
- }
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
+ schedule_delayed_monitor_work(krcp);
unlock_return:
krc_this_cpu_unlock(krcp, flags);
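For callers, the usual entry point into this machinery is the kvfree_rcu() macro rather than kvfree_call_rcu() itself. A hedged sketch, with struct sample and sample_retire() as hypothetical names:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    /* Hypothetical structure freed via the double-argument kvfree_rcu(). */
    struct sample {
            unsigned long data;
            struct rcu_head rh;
    };

    static void sample_retire(struct sample *sp)
    {
            /*
             * Expands (roughly) to kvfree_call_rcu(&sp->rh, sp): the pointer
             * is batched on a bulk page when possible, otherwise queued via
             * sp->rh on the "Channel 3" list described above.
             */
            kvfree_rcu(sp, rh);
    }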
@@ -3547,28 +3488,26 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count += READ_ONCE(krcp->count);
+ count += krc_count(krcp);
+ count += READ_ONCE(krcp->nr_bkv_objs);
+ atomic_set(&krcp->backoff_page_cache_fill, 1);
}
- return count;
+ return count == 0 ? SHRINK_EMPTY : count;
}
static unsigned long
kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
int cpu, freed = 0;
- unsigned long flags;
for_each_possible_cpu(cpu) {
int count;
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- count = krcp->count;
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (krcp->monitor_todo)
- kfree_rcu_drain_unlock(krcp, flags);
- else
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ count = krc_count(krcp);
+ count += drain_page_cache(krcp);
+ kfree_rcu_monitor(&krcp->monitor_work.work);
sc->nr_to_scan -= count;
freed += count;
@@ -3580,69 +3519,35 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
return freed == 0 ? SHRINK_STOP : freed;
}
-static struct shrinker kfree_rcu_shrinker = {
- .count_objects = kfree_rcu_shrink_count,
- .scan_objects = kfree_rcu_shrink_scan,
- .batch = 0,
- .seeks = DEFAULT_SEEKS,
-};
-
void __init kfree_rcu_scheduler_running(void)
{
int cpu;
- unsigned long flags;
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (!krcp->head || krcp->monitor_todo) {
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
- continue;
- }
- krcp->monitor_todo = true;
- schedule_delayed_work_on(cpu, &krcp->monitor_work,
- KFREE_DRAIN_JIFFIES);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
+ if (need_offload_krc(krcp))
+ schedule_delayed_monitor_work(krcp);
}
}
/*
* During early boot, any blocking grace-period wait automatically
- * implies a grace period. Later on, this is never the case for PREEMPTION.
+ * implies a grace period.
*
- * However, because a context switch is a grace period for !PREEMPTION, any
- * blocking grace-period wait automatically implies a grace period if
- * there is only one CPU online at any point time during execution of
- * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds some
- * overhead: RCU still operates correctly.
+ * Later on, this could in theory be the case for kernels built with
+ * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
+ * is not a common case. Furthermore, this optimization would cause
+ * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * grace-period optimization is ignored once the scheduler is running.
*/
static int rcu_blocking_is_gp(void)
{
- int ret;
-
- if (IS_ENABLED(CONFIG_PREEMPTION))
- return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
- might_sleep(); /* Check for RCU read-side critical section. */
- preempt_disable();
- /*
- * If the rcu_state.n_online_cpus counter is equal to one,
- * there is only one CPU, and that CPU sees all prior accesses
- * made by any CPU that was online at the time of its access.
- * Furthermore, if this counter is equal to one, its value cannot
- * change until after the preempt_enable() below.
- *
- * Furthermore, if rcu_state.n_online_cpus is equal to one here,
- * all later CPUs (both this one and any that come online later
- * on) are guaranteed to see all accesses prior to this point
- * in the code, without the need for additional memory barriers.
- * Those memory barriers are provided by CPU-hotplug code.
- */
- ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
- preempt_enable();
- return ret;
+ if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
+ might_sleep();
+ return false;
+ }
+ return true;
}
/**
@@ -3653,10 +3558,12 @@ static int rcu_blocking_is_gp(void)
* read-side critical sections have completed. Note, however, that
* upon return from synchronize_rcu(), the caller might well be executing
* concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
+ * synchronize_rcu() was waiting.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
* sections. This includes hardware interrupt handlers, softirq handlers,
* and NMI handlers.
*
@@ -3677,28 +3584,70 @@ static int rcu_blocking_is_gp(void)
* to have executed a full memory barrier during the execution of
* synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
* again only if the system has more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
*/
void synchronize_rcu(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp())
- return; // Context allows vacuous grace periods.
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
+ if (!rcu_blocking_is_gp()) {
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu_hurry);
+ return;
+ }
+
+ // Context allows vacuous grace periods.
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs at
+ // process level. Therefore, this normal GP overlaps with other
+ // normal GPs only by being fully nested within them, which allows
+ // reuse of ->gp_seq_polled_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+
+ // Update the normal grace-period counters to record
+ // this grace period, but only those used by the boot CPU.
+ // The rcu_scheduler_starting() will take care of the rest of
+ // these counters.
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT);
+ for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
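A minimal sketch of the classic synchronize_rcu() update pattern the kernel-doc above describes: unpublish the object, wait for all pre-existing readers, then free it. struct item, item_list, and item_mutex are hypothetical:

    #include <linux/mutex.h>
    #include <linux/rculist.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct item {
            int key;
            struct list_head node;
    };

    static LIST_HEAD(item_list);            /* readers traverse under rcu_read_lock() */
    static DEFINE_MUTEX(item_mutex);

    static void item_remove(struct item *it)
    {
            mutex_lock(&item_mutex);
            list_del_rcu(&it->node);
            mutex_unlock(&item_mutex);

            synchronize_rcu();              /* wait for all pre-existing readers */
            kfree(it);
    }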
/**
+ * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
+ * @rgosp: Place to put state cookie
+ *
+ * Stores into @rgosp a value that will always be treated by functions
+ * like poll_state_synchronize_rcu_full() as a cookie whose grace period
+ * has already completed.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+ rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
+/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
* Returns a cookie that is used by a later call to cond_synchronize_rcu()
- * to determine whether or not a full grace period has elapsed in the
- * meantime.
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime.
*/
unsigned long get_state_synchronize_rcu(void)
{
@@ -3707,33 +3656,256 @@ unsigned long get_state_synchronize_rcu(void)
* before the load from ->gp_seq.
*/
smp_mb(); /* ^^^ */
- return rcu_seq_snap(&rcu_state.gp_seq);
+ return rcu_seq_snap(&rcu_state.gp_seq_polled);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
/**
- * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
+ * @rgosp: location to place combined normal/expedited grace-period state
*
- * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ * Places the normal and expedited grace-period states in @rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
+ * long, but is guaranteed to see all grace periods. In contrast, the
+ * combined state occupies less memory, but can sometimes fail to take
+ * grace periods into account.
+ *
+ * This does not guarantee that the needed grace period will actually
+ * start.
+ */
+void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ /*
+ * Any prior manipulation of RCU-protected data must happen
+ * before the loads from ->gp_seq and ->expedited_sequence.
+ */
+ smp_mb(); /* ^^^ */
+ rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+ rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
+
+/*
+ * Helper function for start_poll_synchronize_rcu() and
+ * start_poll_synchronize_rcu_full().
+ */
+static void start_poll_synchronize_rcu_common(void)
+{
+ unsigned long flags;
+ bool needwake;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ lockdep_assert_irqs_enabled();
+ local_irq_save(flags);
+ rdp = this_cpu_ptr(&rcu_data);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp); // irqs already disabled.
+ // Note it is possible for a grace period to have elapsed between
+ // the above call to get_state_synchronize_rcu() and the below call
+ // to rcu_seq_snap. This is OK, the worst that happens is that we
+ // get a grace period that no one needed. These accesses are ordered
+ // by smp_mb(), and we are accessing them in the opposite order
+ // from which they are updated at grace-period start, as required.
+ needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (needwake)
+ rcu_gp_kthread_wake();
+}
+
+/**
+ * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime. If the needed grace period
+ * is not already slated to start, notifies RCU core of the need for that
+ * grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+ unsigned long gp_seq = get_state_synchronize_rcu();
+
+ start_poll_synchronize_rcu_common();
+ return gp_seq;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
+
+/**
+ * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ *
+ * Places the normal and expedited grace-period states in *@rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed grace period is not already slated to start, notifies
+ * RCU core of the need for that grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+
+ start_poll_synchronize_rcu_common();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
+
+/**
+ * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
+ * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which @oldstate was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true. Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @oldstate
+ * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
+ * on the one hand or by directly invoking either synchronize_rcu() or
+ * synchronize_rcu_expedited() on the other.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than a billion grace periods (and way more on a 64-bit system!).
+ * Those needing to keep old state values for very long time periods
+ * (many hours even on 32-bit systems) should check them occasionally and
+ * either refresh them or set a flag indicating that the grace period has
+ * completed. Alternatively, they can use get_completed_synchronize_rcu()
+ * to get a guaranteed-completed grace-period state.
+ *
+ * In addition, because oldstate compresses the grace-period state for
+ * both normal and expedited grace periods into a single unsigned long,
+ * it can miss a grace period when synchronize_rcu() runs concurrently
+ * with synchronize_rcu_expedited(). If this is unacceptable, please
+ * instead use the _full() variant of these polling APIs.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate, and that returned at the end
+ * of this function.
+ */
+bool poll_state_synchronize_rcu(unsigned long oldstate)
+{
+ if (oldstate == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
+ smp_mb(); /* Ensure GP ends before subsequent accesses. */
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
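/*
 * Illustrative sketch (not part of this patch): the typical lifecycle of the
 * single-cookie polled API.  A hypothetical cache defers freeing of entries,
 * tagging each with a cookie from start_poll_synchronize_rcu(), and a later
 * reaper frees only those entries whose grace period has elapsed.  The names
 * struct cache_entry, cache_defer_free(), and cache_reap() are made up.
 */
struct cache_entry {
	unsigned long gp_cookie;
	struct list_head list;
	/* ... payload ... */
};

static void cache_defer_free(struct cache_entry *e, struct list_head *deferred)
{
	/* Snapshot and, if needed, kick off a grace period. */
	e->gp_cookie = start_poll_synchronize_rcu();
	list_add_tail(&e->list, deferred);
}

static void cache_reap(struct list_head *deferred)
{
	struct cache_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, deferred, list) {
		/*
		 * Entries were added in cookie order, so stop at the first
		 * one whose grace period has not yet completed.
		 */
		if (!poll_state_synchronize_rcu(e->gp_cookie))
			break;
		list_del(&e->list);
		kfree(e);
	}
}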
+
+/**
+ * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which *rgosp was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true. Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @rgosp
+ * to cond_synchronize_rcu_full() or by directly invoking synchronize_rcu().
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited
+ * for more than a billion grace periods (and way more on a 64-bit
+ * system!). Those needing to keep rcu_gp_oldstate values for very
+ * long time periods (many hours even on 32-bit systems) should check
+ * them occasionally and either refresh them or set a flag indicating
+ * that the grace period has completed. Alternatively, they can use
+ * get_completed_synchronize_rcu_full() to get a guaranteed-completed
+ * grace-period state.
+ *
+ * This function provides the same memory-ordering guarantees that would
+ * be provided by a synchronize_rcu() that was invoked at the call to
+ * the function that provided @rgosp, and that returned at the end of this
+ * function. And this guarantee requires that the root rcu_node structure's
+ * ->gp_seq field be checked instead of that of the rcu_state structure.
+ * The problem is that the just-ending grace-period's callbacks can be
+ * invoked between the time that the root rcu_node structure's ->gp_seq
+ * field is updated and the time that the rcu_state structure's ->gp_seq
+ * field is updated. Therefore, if a single synchronize_rcu() is to
+ * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
+ * then the root rcu_node structure is the one that needs to be polled.
+ */
+bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ struct rcu_node *rnp = rcu_get_root();
+
+ smp_mb(); // Order against root rcu_node structure grace-period cleanup.
+ if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
+ rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
+ rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
+ smp_mb(); /* Ensure GP ends before subsequent accesses. */
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
*
* If a full RCU grace period has elapsed since the earlier call to
- * get_state_synchronize_rcu(), just return. Otherwise, invoke
- * synchronize_rcu() to wait for a full grace period.
+ * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
+ * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
*
- * Yes, this function does not take counter wrap into account. But
- * counter wrap is harmless. If the counter wraps, we have waited for
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
* more than 2 billion grace periods (and way more on a 64-bit system!),
- * so waiting for one additional grace period should be just fine.
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
*/
void cond_synchronize_rcu(unsigned long oldstate)
{
- if (!rcu_seq_done(&rcu_state.gp_seq, oldstate))
+ if (!poll_state_synchronize_rcu(oldstate))
synchronize_rcu();
- else
- smp_mb(); /* Ensure GP ends before subsequent accesses. */
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+/**
+ * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu() to wait
+ * for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
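/*
 * Illustrative sketch (not part of this patch): the _full variants trade an
 * extra word of cookie storage for immunity to the normal/expedited aliasing
 * described above.  A hypothetical teardown path using
 * cond_synchronize_rcu_full() might look as follows; struct my_widget and
 * unpublish_widget() are made-up names.
 */
struct my_widget {
	struct list_head node;
	/* ... payload ... */
};

/* Hypothetical: remove w from whatever RCU-protected list publishes it. */
static void unpublish_widget(struct my_widget *w)
{
	list_del_rcu(&w->node);
}

static void my_widget_teardown(struct my_widget *w)
{
	struct rcu_gp_oldstate gps;

	unpublish_widget(w);			/* No new readers can find w. */
	get_state_synchronize_rcu_full(&gps);

	/* ... other potentially lengthy teardown work ... */

	cond_synchronize_rcu_full(&gps);	/* Waits only if no GP has elapsed. */
	kfree(w);
}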
+
/*
* Check to see if there is any immediate RCU-related work to be done by
* the current CPU, returning 1 if so and zero otherwise. The checks are
@@ -3747,11 +3919,13 @@ static int rcu_pending(int user)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ lockdep_assert_irqs_disabled();
+
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rdp);
/* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp))
+ if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
@@ -3764,13 +3938,13 @@ static int rcu_pending(int user)
return 1;
/* Does this CPU have callbacks ready to invoke? */
- if (!rcu_segcblist_is_offloaded(&rdp->cblist) &&
+ if (!rcu_rdp_is_offloaded(rdp) &&
rcu_segcblist_ready_cbs(&rdp->cblist))
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
- !rcu_segcblist_is_offloaded(&rdp->cblist) &&
+ !rcu_rdp_is_offloaded(rdp) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
@@ -3816,26 +3990,56 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
}
/*
- * Called with preemption disabled, and from cross-cpu IRQ context.
+ * If needed, entrain an rcu_barrier() callback on rdp->cblist.
*/
-static void rcu_barrier_func(void *cpu_in)
+static void rcu_barrier_entrain(struct rcu_data *rdp)
{
- uintptr_t cpu = (uintptr_t)cpu_in;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
+ unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+ bool wake_nocb = false;
+ bool was_alldone = false;
+ lockdep_assert_held(&rcu_state.barrier_lock);
+ if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
+ return;
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
rcu_nocb_lock(rdp);
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ /*
+ * Flush bypass and wakeup rcuog if we add callbacks to an empty regular
+ * queue. This way we don't wait for bypass timer that can reach seconds
+ * if it's fully lazy.
+ */
+ was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- rcu_barrier_trace(TPS("IRQNQ"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
}
rcu_nocb_unlock(rdp);
+ if (wake_nocb)
+ wake_nocb_gp(rdp, false);
+ smp_store_release(&rdp->barrier_seq_snap, gseq);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_handler(void *cpu_in)
+{
+ uintptr_t cpu = (uintptr_t)cpu_in;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ lockdep_assert_irqs_disabled();
+ WARN_ON_ONCE(cpu != rdp->cpu);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+ raw_spin_lock(&rcu_state.barrier_lock);
+ rcu_barrier_entrain(rdp);
+ raw_spin_unlock(&rcu_state.barrier_lock);
}
/**
@@ -3849,6 +4053,8 @@ static void rcu_barrier_func(void *cpu_in)
void rcu_barrier(void)
{
uintptr_t cpu;
+ unsigned long flags;
+ unsigned long gseq;
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
@@ -3859,15 +4065,16 @@ void rcu_barrier(void)
/* Did someone else do our work for us? */
if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
- rcu_barrier_trace(TPS("EarlyExit"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rcu_state.barrier_mutex);
return;
}
/* Mark the start of the barrier operation. */
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
rcu_seq_start(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
/*
@@ -3879,7 +4086,7 @@ void rcu_barrier(void)
*/
init_completion(&rcu_state.barrier_completion);
atomic_set(&rcu_state.barrier_cpu_count, 2);
- get_online_cpus();
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
/*
* Force each CPU with callbacks to register a new callback.
@@ -3888,29 +4095,31 @@ void rcu_barrier(void)
*/
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (cpu_is_offline(cpu) &&
- !rcu_segcblist_is_offloaded(&rdp->cblist))
+retry:
+ if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
continue;
- if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
- rcu_barrier_trace(TPS("OnlineQ"), cpu,
- rcu_state.barrier_sequence);
- smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
- } else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
- cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
- rcu_state.barrier_sequence);
- local_irq_disable();
- rcu_barrier_func((void *)cpu);
- local_irq_enable();
- } else if (cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
- rcu_state.barrier_sequence);
- } else {
- rcu_barrier_trace(TPS("OnlineNQ"), cpu,
- rcu_state.barrier_sequence);
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ if (!rcu_rdp_cpu_online(rdp)) {
+ rcu_barrier_entrain(rdp);
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
+ schedule_timeout_uninterruptible(1);
+ goto retry;
}
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
}
- put_online_cpus();
/*
* Now that we have an rcu_barrier_callback() callback on each
@@ -3925,16 +4134,218 @@ void rcu_barrier(void)
/* Mark the end of the barrier operation. */
rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
rcu_seq_end(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ }
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rcu_state.barrier_mutex);
}
EXPORT_SYMBOL_GPL(rcu_barrier);
+static unsigned long rcu_barrier_last_throttle;
+
+/**
+ * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second
+ *
+ * This can be thought of as guard rails around rcu_barrier() that
+ * permits unrestricted userspace use, at least assuming the hardware's
+ * try_cmpxchg() is robust. There will be at most one call per second to
+ * rcu_barrier() system-wide from use of this function, which means that
+ * callers might needlessly wait a second or three.
+ *
+ * This is intended for use by test suites to avoid OOM by flushing RCU
+ * callbacks from the previous test before starting the next. See the
+ * rcutree.do_rcu_barrier module parameter for more information.
+ *
+ * Why not simply make rcu_barrier() more scalable? That might be
+ * the eventual endpoint, but let's keep it simple for the time being.
+ * Note that the module parameter infrastructure serializes calls to a
+ * given .set() function, but should concurrent .set() invocation ever be
+ * possible, we are ready!
+ */
+static void rcu_barrier_throttled(void)
+{
+ unsigned long j = jiffies;
+ unsigned long old = READ_ONCE(rcu_barrier_last_throttle);
+ unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
+
+ while (time_in_range(j, old, old + HZ / 16) ||
+ !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) {
+ schedule_timeout_idle(HZ / 16);
+ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
+ smp_mb(); /* caller's subsequent code after above check. */
+ return;
+ }
+ j = jiffies;
+ old = READ_ONCE(rcu_barrier_last_throttle);
+ }
+ rcu_barrier();
+}
+
+/*
+ * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier
+ * request arrives. We insist on a true value to allow for possible
+ * future expansion.
+ */
+static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp)
+{
+ bool b;
+ int ret;
+
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
+ return -EAGAIN;
+ ret = kstrtobool(val, &b);
+ if (!ret && b) {
+ atomic_inc((atomic_t *)kp->arg);
+ rcu_barrier_throttled();
+ atomic_dec((atomic_t *)kp->arg);
+ }
+ return ret;
+}
+
+/*
+ * Output the number of outstanding rcutree.do_rcu_barrier requests.
+ */
+static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg));
+}
+
+static const struct kernel_param_ops do_rcu_barrier_ops = {
+ .set = param_set_do_rcu_barrier,
+ .get = param_get_do_rcu_barrier,
+};
+static atomic_t do_rcu_barrier;
+module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644);
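/*
 * Illustrative sketch (not part of this patch): a userspace test harness can
 * request a throttled rcu_barrier() by writing "1" to the module parameter
 * registered above.  The sysfs path below is an assumption based on the
 * standard /sys/module/<name>/parameters/ layout for the rcutree parameters.
 */
#include <fcntl.h>
#include <unistd.h>

static int request_rcu_barrier(void)
{
	int fd = open("/sys/module/rcutree/parameters/do_rcu_barrier", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	/* The write returns only after rcu_barrier_throttled() has finished. */
	n = write(fd, "1", 1);
	close(fd);
	return n == 1 ? 0 : -1;
}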
+
+/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+ return READ_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective? This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+ return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
+bool rcu_cpu_online(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return rcu_rdp_cpu_online(rdp);
+}
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+
+/*
+ * Is the current CPU online as far as RCU is concerned?
+ *
+ * Disable preemption to avoid false positives that could otherwise
+ * happen due to the current CPU number being sampled, this task being
+ * preempted, its old CPU being taken offline, resuming on some other CPU,
+ * then determining that its old CPU is now offline.
+ *
+ * Disable checking if in an NMI handler because we cannot safely
+ * report errors from NMI handlers anyway. In addition, it is OK to use
+ * RCU on an offline processor during initial boot, hence the check for
+ * rcu_scheduler_fully_active.
+ */
+bool rcu_lockdep_current_cpu_online(void)
+{
+ struct rcu_data *rdp;
+ bool ret = false;
+
+ if (in_nmi() || !rcu_scheduler_fully_active)
+ return true;
+ preempt_disable_notrace();
+ rdp = this_cpu_ptr(&rcu_data);
+ /*
+ * Strictly, we care here about the case where the current CPU is
+ * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
+ * not being up to date. So arch_spin_is_locked() might have a
+ * false positive if it's held by some *other* CPU, but that's
+ * OK because that just means a false *negative* on the warning.
+ */
+ if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
+ ret = true;
+ preempt_enable_notrace();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
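/*
 * Illustrative sketch (not part of this patch): debug code can combine
 * rcu_lockdep_current_cpu_online() with RCU_LOCKDEP_WARN() to catch RCU use
 * from a CPU that RCU believes to be offline.  my_debug_check() is a
 * made-up name.
 */
static void my_debug_check(void)
{
	RCU_LOCKDEP_WARN(!rcu_lockdep_current_cpu_online(),
			 "RCU used from a CPU that RCU considers offline!");
}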
+
+// Has rcu_init() been invoked? This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+ return !!rcu_state.n_online_cpus;
+}
+
+/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section. Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields. Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely. That said, invoking it after the fact will cost you
+ * a needless lock acquisition. So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+ long mask;
+ struct rcu_node *rnp = rnp_leaf;
+
+ raw_lockdep_assert_held_rcu_node(rnp_leaf);
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
+ WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
+ return;
+ for (;;) {
+ mask = rnp->grpmask;
+ rnp = rnp->parent;
+ if (!rnp)
+ break;
+ raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+ rnp->qsmaskinit &= ~mask;
+ /* Between grace periods, so better already be zero! */
+ WARN_ON_ONCE(rnp->qsmask);
+ if (rnp->qsmaskinit) {
+ raw_spin_unlock_rcu_node(rnp);
+ /* irqs remain disabled. */
+ return;
+ }
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+}
+
/*
 * Propagate ->qsmaskinit bits up the rcu_node tree to account for the
* first CPU in a given leaf rcu_node structure coming online. The caller
- * must hold the corresponding leaf rcu_node ->lock with interrrupts
+ * must hold the corresponding leaf rcu_node ->lock with interrupts
* disabled.
*/
static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
@@ -3965,17 +4376,20 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
static void __init
rcu_boot_init_percpu_data(int cpu)
{
+ struct context_tracking *ct = this_cpu_ptr(&context_tracking);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
/* Set up local state, ensuring consistent view of global state. */
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
INIT_WORK(&rdp->strict_work, strict_work_handler);
- WARN_ON_ONCE(rdp->dynticks_nesting != 1);
- WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
+ WARN_ON_ONCE(ct->dynticks_nesting != 1);
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
+ rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+ rdp->last_sched_clock = jiffies;
rdp->cpu = cpu;
rcu_boot_init_nocb_percpu_data(rdp);
}
@@ -3993,29 +4407,32 @@ rcu_boot_init_percpu_data(int cpu)
int rcutree_prepare_cpu(unsigned int cpu)
{
unsigned long flags;
+ struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rcu_get_root();
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
- if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
- !rcu_segcblist_is_offloaded(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
- rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
- rcu_dynticks_eqs_online();
+ ct->dynticks_nesting = 1; /* CPU not up, no tearing. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
+ * Only non-NOCB CPUs that didn't have early-boot callbacks need to be
+ * (re-)initialized.
+ */
+ if (!rcu_segcblist_is_enabled(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
+
+ /*
* Add CPU to leaf rcu_node pending-online bitmask. Any needed
* propagation up the rcu_node tree will happen at the beginning
* of the next grace period.
*/
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- rdp->beenonline = true; /* We have now been online. */
rdp->gp_seq = READ_ONCE(rnp->gp_seq);
rdp->gp_seq_needed = rdp->gp_seq;
rdp->cpu_no_qs.b.norm = true;
@@ -4025,7 +4442,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcu_prepare_kthreads(cpu);
+ rcu_spawn_one_boost_kthread(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
@@ -4043,6 +4460,16 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
}
/*
+ * Has the specified (known valid) CPU ever been fully online?
+ */
+bool rcu_cpu_beenfullyonline(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return smp_load_acquire(&rdp->beenonline);
+}
+
+/*
* Near the end of the CPU-online process. Pretty much all services
* enabled, and the CPU is now very much alive.
*/
@@ -4068,29 +4495,6 @@ int rcutree_online_cpu(unsigned int cpu)
}
/*
- * Near the beginning of the process. The CPU is still very much alive
- * with pretty much all services enabled.
- */
-int rcutree_offline_cpu(unsigned int cpu)
-{
- unsigned long flags;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
- rnp = rdp->mynode;
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rnp->ffmask &= ~rdp->grpmask;
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- rcutree_affinity_setting(cpu, cpu);
-
- // nohz_full CPUs need the tick for stop-machine to work quickly
- tick_dep_set(TICK_DEP_BIT_RCU);
- return 0;
-}
-
-/*
* Mark the specified CPU as being online so that subsequent grace periods
* (both expedited and normal) will wait on it. Note that this means that
* incoming CPUs are not allowed to use RCU read-side critical sections
@@ -4100,15 +4504,18 @@ int rcutree_offline_cpu(unsigned int cpu)
* Note that this function is special in that it is invoked directly
* from the incoming CPU rather than from the cpuhp_step mechanism.
* This is because this function must be invoked at a precise location.
+ * This incoming CPU must not have enabled interrupts yet.
+ *
+ * This mirrors the effects of rcutree_report_cpu_dead().
*/
-void rcu_cpu_starting(unsigned int cpu)
+void rcutree_report_cpu_starting(unsigned int cpu)
{
- unsigned long flags;
unsigned long mask;
struct rcu_data *rdp;
struct rcu_node *rnp;
bool newcpu;
+ lockdep_assert_irqs_disabled();
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu_started)
return;
@@ -4116,11 +4523,12 @@ void rcu_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
+ rcu_dynticks_eqs_online();
+ raw_spin_lock(&rcu_state.barrier_lock);
+ raw_spin_lock_rcu_node(rnp);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
+ raw_spin_unlock(&rcu_state.barrier_lock);
newcpu = !(rnp->expmaskinitnext & mask);
rnp->expmaskinitnext |= mask;
/* Allow lockless access for expedited grace periods. */
@@ -4132,15 +4540,18 @@ void rcu_cpu_starting(unsigned int cpu)
/* An incoming CPU should never be blocking a grace period. */
if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
+ /* rcu_report_qs_rnp() *really* wants some flags to restore */
+ unsigned long flags;
+
+ local_irq_save(flags);
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ raw_spin_unlock_rcu_node(rnp);
}
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ smp_store_release(&rdp->beenonline, true);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
@@ -4151,41 +4562,41 @@ void rcu_cpu_starting(unsigned int cpu)
* Note that this function is special in that it is invoked directly
* from the outgoing CPU rather than from the cpuhp_step mechanism.
* This is because this function must be invoked at a precise location.
+ *
+ * This mirrors the effect of rcutree_report_cpu_starting().
*/
-void rcu_report_dead(unsigned int cpu)
+void rcutree_report_cpu_dead(void)
{
unsigned long flags;
unsigned long mask;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
- /* QS for any half-done expedited grace period. */
- preempt_disable();
- rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
- preempt_enable();
+ /*
+	 * IRQs must be disabled from now on and until the CPU dies, or an
+	 * interrupt may introduce a new read-side critical section while this
+	 * CPU is already off the QS masks.
+ */
+ lockdep_assert_irqs_disabled();
+ // Do any dangling deferred wakeups.
+ do_nocb_deferred_wakeup(rdp);
+
rcu_preempt_deferred_qs(current);
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- raw_spin_lock(&rcu_state.ofl_lock);
+ arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
/* Report quiescent state -before- changing ->qsmaskinitnext! */
+ rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- raw_spin_unlock(&rcu_state.ofl_lock);
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(rnp->ofl_seq & 0x1);
-
+ arch_spin_unlock(&rcu_state.ofl_lock);
rdp->cpu_started = false;
}
@@ -4203,25 +4614,28 @@ void rcutree_migrate_callbacks(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
bool needwake;
- if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
+ if (rcu_rdp_is_offloaded(rdp) ||
rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
- local_irq_save(flags);
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
+ rcu_barrier_entrain(rdp);
my_rdp = this_cpu_ptr(&rcu_data);
my_rnp = my_rdp->mynode;
rcu_nocb_lock(my_rdp); /* irqs already disabled. */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
/* Leverage recent GPs and set GP for new callbacks. */
needwake = rcu_advance_cbs(my_rnp, rdp) ||
rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_disable(&rdp->cblist);
- WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
- !rcu_segcblist_n_cbs(&my_rdp->cblist));
- if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ check_cb_ovld_locked(my_rdp, my_rnp);
+ if (rcu_rdp_is_offloaded(my_rdp)) {
raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
__call_rcu_nocb_wake(my_rdp, true, flags);
} else {
@@ -4237,7 +4651,60 @@ void rcutree_migrate_callbacks(int cpu)
cpu, rcu_segcblist_n_cbs(&rdp->cblist),
rcu_segcblist_first_cb(&rdp->cblist));
}
-#endif
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+ // Stop-machine done, so allow nohz_full to disable tick.
+ tick_dep_clear(TICK_DEP_BIT_RCU);
+ return 0;
+}
+
+/*
+ * Near the end of the offline process. Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+ bool blkd;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+ trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+ return 0;
+}
+
+/*
+ * Near the beginning of the process. The CPU is still very much alive
+ * with pretty much all services enabled.
+ */
+int rcutree_offline_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask &= ~rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ rcutree_affinity_setting(cpu, cpu);
+
+ // nohz_full CPUs need the tick for stop-machine to work quickly
+ tick_dep_set(TICK_DEP_BIT_RCU);
+ return 0;
+}
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
* On non-huge systems, use expedited RCU grace periods to make suspend
@@ -4249,11 +4716,13 @@ static int rcu_pm_notify(struct notifier_block *self,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
+ rcu_async_hurry();
rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
rcu_unexpedite_gp();
+ rcu_async_relax();
break;
default:
break;
@@ -4261,31 +4730,61 @@ static int rcu_pm_notify(struct notifier_block *self,
return NOTIFY_OK;
}
+#ifdef CONFIG_RCU_EXP_KTHREAD
+struct kthread_worker *rcu_exp_gp_kworker;
+struct kthread_worker *rcu_exp_par_gp_kworker;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+ const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+ const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+ struct sched_param param = { .sched_priority = kthread_prio };
+
+ rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+ pr_err("Failed to create %s!\n", gp_kworker_name);
+ return;
+ }
+
+ rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+ if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+ pr_err("Failed to create %s!\n", par_gp_kworker_name);
+ kthread_destroy_worker(rcu_exp_gp_kworker);
+ return;
+ }
+
+ sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+ &param);
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+struct workqueue_struct *rcu_par_gp_wq;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+}
+
+static inline void rcu_alloc_par_gp_wq(void)
+{
+ rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_par_gp_wq);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
/*
* Spawn the kthreads that handle RCU's grace periods.
*/
static int __init rcu_spawn_gp_kthread(void)
{
unsigned long flags;
- int kthread_prio_in = kthread_prio;
struct rcu_node *rnp;
struct sched_param sp;
struct task_struct *t;
-
- /* Force priority into range. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
- && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
- kthread_prio = 2;
- else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
- kthread_prio = 1;
- else if (kthread_prio < 0)
- kthread_prio = 0;
- else if (kthread_prio > 99)
- kthread_prio = 99;
-
- if (kthread_prio != kthread_prio_in)
- pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
- kthread_prio, kthread_prio_in);
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
rcu_scheduler_fully_active = 1;
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
@@ -4303,8 +4802,17 @@ static int __init rcu_spawn_gp_kthread(void)
smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
- rcu_spawn_nocb_kthreads();
- rcu_spawn_boost_kthreads();
+ /* This is a pre-SMP initcall, we expect a single CPU */
+ WARN_ON(num_online_cpus() > 1);
+ /*
+ * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
+ * due to rcu_scheduler_fully_active.
+ */
+ rcu_spawn_cpu_nocb_kthread(smp_processor_id());
+ rcu_spawn_one_boost_kthread(rdp->mynode);
+ rcu_spawn_core_kthreads();
+ /* Create kthread worker for expedited GPs */
+ rcu_start_exp_gp_kworkers();
return 0;
}
early_initcall(rcu_spawn_gp_kthread);
@@ -4321,9 +4829,20 @@ early_initcall(rcu_spawn_gp_kthread);
*/
void rcu_scheduler_starting(void)
{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
WARN_ON(num_online_cpus() != 1);
WARN_ON(nr_context_switches() > 0);
rcu_test_sync_prims();
+
+ // Fix up the ->gp_seq counters.
+ local_irq_save(flags);
+ rcu_for_each_node_breadth_first(rnp)
+ rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+ local_irq_restore(flags);
+
+ // Switch out of early boot mode.
rcu_scheduler_active = RCU_SCHEDULER_INIT;
rcu_test_sync_prims();
}
@@ -4396,6 +4915,10 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[2]);
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
+ mutex_init(&rnp->boost_kthread_mutex);
+ raw_spin_lock_init(&rnp->exp_poll_lock);
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
}
}
@@ -4411,15 +4934,51 @@ static void __init rcu_init_one(void)
}
/*
+ * Force priority from the kernel command-line into range.
+ */
+static void __init sanitize_kthread_prio(void)
+{
+ int kthread_prio_in = kthread_prio;
+
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
+ && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
+ kthread_prio = 2;
+ else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("%s: Limited prio to %d from %d\n",
+ __func__, kthread_prio, kthread_prio_in);
+}
+
+/*
* Compute the rcu_node tree geometry from kernel parameters. This cannot
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
*/
-static void __init rcu_init_geometry(void)
+void rcu_init_geometry(void)
{
ulong d;
int i;
+ static unsigned long old_nr_cpu_ids;
int rcu_capacity[RCU_NUM_LVLS];
+ static bool initialized;
+
+ if (initialized) {
+ /*
+ * Warn if setup_nr_cpu_ids() had not yet been invoked,
+ * unless nr_cpus_ids == NR_CPUS, in which case who cares?
+ */
+ WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
+ return;
+ }
+
+ old_nr_cpu_ids = nr_cpu_ids;
+ initialized = true;
/*
* Initialize any unspecified boot parameters.
@@ -4513,12 +5072,24 @@ static void __init rcu_dump_rcu_node_tree(void)
}
struct workqueue_struct *rcu_gp_wq;
-struct workqueue_struct *rcu_par_gp_wq;
static void __init kfree_rcu_batch_init(void)
{
int cpu;
- int i;
+ int i, j;
+ struct shrinker *kfree_rcu_shrinker;
+
+ /* Clamp it to [0:100] seconds interval. */
+ if (rcu_delay_page_cache_fill_msec < 0 ||
+ rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
+
+ rcu_delay_page_cache_fill_msec =
+ clamp(rcu_delay_page_cache_fill_msec, 0,
+ (int) (100 * MSEC_PER_SEC));
+
+ pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
+ rcu_delay_page_cache_fill_msec);
+ }
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
@@ -4526,24 +5097,40 @@ static void __init kfree_rcu_batch_init(void)
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
+
+ for (j = 0; j < FREE_N_CHANNELS; j++)
+ INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
}
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
- INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
- if (register_shrinker(&kfree_rcu_shrinker))
- pr_err("Failed to register kfree_rcu() shrinker!\n");
+
+ kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree");
+ if (!kfree_rcu_shrinker) {
+ pr_err("Failed to allocate kfree_rcu() shrinker!\n");
+ return;
+ }
+
+ kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
+ kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
+
+ shrinker_register(kfree_rcu_shrinker);
}
void __init rcu_init(void)
{
- int cpu;
+ int cpu = smp_processor_id();
rcu_early_boot_tests();
kfree_rcu_batch_init();
rcu_bootup_announce();
+ sanitize_kthread_prio();
rcu_init_geometry();
rcu_init_one();
if (dump_tree)
@@ -4557,18 +5144,15 @@ void __init rcu_init(void)
* or the scheduler are operational.
*/
pm_notifier(rcu_pm_notify, 0);
- for_each_online_cpu(cpu) {
- rcutree_prepare_cpu(cpu);
- rcu_cpu_starting(cpu);
- rcutree_online_cpu(cpu);
- }
+ WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot.
+ rcutree_prepare_cpu(cpu);
+ rcutree_report_cpu_starting(cpu);
+ rcutree_online_cpu(cpu);
- /* Create workqueue for expedited GPs and for Tree SRCU. */
+ /* Create workqueue for Tree SRCU and for expedited GPs. */
rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
WARN_ON(!rcu_gp_wq);
- rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
- WARN_ON(!rcu_par_gp_wq);
- srcu_init();
+ rcu_alloc_par_gp_wq();
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
@@ -4576,8 +5160,14 @@ void __init rcu_init(void)
qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
else
qovld_calc = qovld;
+
+ // Kick-start in case any polled grace periods started early.
+ (void)start_poll_synchronize_rcu_expedited();
+
+ rcu_test_sync_prims();
}
#include "tree_stall.h"
#include "tree_exp.h"
+#include "tree_nocb.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7708ed161f4a..e9821a8422db 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -10,6 +10,7 @@
*/
#include <linux/cache.h>
+#include <linux/kthread.h>
#include <linux/spinlock.h>
#include <linux/rtmutex.h>
#include <linux/threads.h>
@@ -23,7 +24,11 @@
/* Communicate arguments to a workqueue handler. */
struct rcu_exp_work {
unsigned long rew_s;
+#ifdef CONFIG_RCU_EXP_KTHREAD
+ struct kthread_work rew_work;
+#else
struct work_struct rew_work;
+#endif /* CONFIG_RCU_EXP_KTHREAD */
};
/* RCU's kthread states for tracing. */
@@ -56,8 +61,6 @@ struct rcu_node {
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext;
- unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */
- /* Online CPUs for next grace period. */
unsigned long expmask; /* CPUs or groups that need to check in */
/* to allow the current expedited GP */
/* to complete. */
@@ -110,11 +113,15 @@ struct rcu_node {
/* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
+ struct mutex boost_kthread_mutex;
+ /* Exclusion for thread spawning and affinity */
+ /* manipulation. */
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
unsigned int boost_kthread_status;
/* State of boost_kthread_task for tracing. */
+ unsigned long n_boosts; /* Number of boosts for this rcu_node structure. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
@@ -126,6 +133,10 @@ struct rcu_node {
wait_queue_head_t exp_wq[4];
struct rcu_exp_work rew;
bool exp_need_flush; /* Need to flush workitem? */
+ raw_spinlock_t exp_poll_lock;
+ /* Lock and data for polled expedited grace periods. */
+ unsigned long exp_seq_poll_rq;
+ struct work_struct exp_poll_wq;
} ____cacheline_internodealigned_in_smp;
/*
@@ -147,16 +158,32 @@ union rcu_noqs {
u16 s; /* Set of bits, aggregate OR here. */
};
+/*
+ * Record the snapshot of the core stats at half of the first RCU stall timeout.
+ * The member gp_seq is used to ensure that all members are updated only once
+ * during the sampling period. The snapshot is taken only if this gp_seq is not
+ * equal to rdp->gp_seq.
+ */
+struct rcu_snap_record {
+ unsigned long gp_seq; /* Track rdp->gp_seq counter */
+ u64 cputime_irq; /* Accumulated cputime of hard irqs */
+ u64 cputime_softirq;/* Accumulated cputime of soft irqs */
+ u64 cputime_system; /* Accumulated cputime of kernel tasks */
+ unsigned long nr_hardirqs; /* Accumulated number of hard irqs */
+ unsigned int nr_softirqs; /* Accumulated number of soft irqs */
+ unsigned long long nr_csw; /* Accumulated number of task switches */
+ unsigned long jiffies; /* Track jiffies value */
+};
+
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
unsigned long gp_seq; /* Track rsp->gp_seq counter. */
unsigned long gp_seq_needed; /* Track furthest future GP request. */
union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
- bool core_needs_qs; /* Core waits for quiesc state. */
+ bool core_needs_qs; /* Core waits for quiescent state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
- bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
bool cpu_started; /* RCU watching this onlining CPU. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
@@ -181,32 +208,28 @@ struct rcu_data {
/* 3) dynticks interface. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
- long dynticks_nesting; /* Track process nesting level. */
- long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */
- atomic_t dynticks; /* Even value for idle, else odd. */
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
bool rcu_forced_tick; /* Forced tick to provide QS. */
bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
-#ifdef CONFIG_RCU_FAST_NO_HZ
- unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
- unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
- int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
-#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
+ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */
struct rcu_head barrier_head;
int exp_dynticks_snap; /* Double-check need for IPI. */
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+ struct swait_queue_head nocb_state_wq; /* For offloading state changes */
struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
+ struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */
+ /* spawning */
/* The following fields are used by call_rcu, hence own cacheline. */
raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
@@ -217,7 +240,6 @@ struct rcu_data {
/* The following fields are used by GP kthread, hence own cacheline. */
raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
- struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
u8 nocb_gp_bypass; /* Found a bypass on last scan? */
u8 nocb_gp_gp; /* GP to wait for on last scan? */
@@ -226,8 +248,12 @@ struct rcu_data {
struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
struct task_struct *nocb_cb_kthread;
- struct rcu_data *nocb_next_cb_rdp;
- /* Next rcu_data in wakeup chain. */
+ struct list_head nocb_head_rdp; /*
+ * Head of rcu_data list in wakeup chain,
+ * if rdp_gp.
+ */
+ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */
+ struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */
/* The following fields are used by CB kthread, hence new cacheline. */
struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
@@ -239,6 +265,7 @@ struct rcu_data {
/* rcuc per-CPU kthread or NULL. */
unsigned int rcu_cpu_kthread_status;
char rcu_cpu_has_work;
+ unsigned long rcuc_activity;
/* 7) Diagnostic data, including RCU CPU stall warnings. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
@@ -251,14 +278,20 @@ struct rcu_data {
unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */
short rcu_onl_gp_flags; /* ->gp_flags at last online. */
unsigned long last_fqs_resched; /* Time of last rcu_resched(). */
+ unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */
+ struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
+ /* the first RCU stall timeout */
+ long lazy_len; /* Length of buffered lazy callbacks. */
int cpu;
};
/* Values for nocb_defer_wakeup field in struct rcu_data. */
#define RCU_NOCB_WAKE_NOT 0
-#define RCU_NOCB_WAKE 1
-#define RCU_NOCB_WAKE_FORCE 2
+#define RCU_NOCB_WAKE_BYPASS 1
+#define RCU_NOCB_WAKE_LAZY 2
+#define RCU_NOCB_WAKE 3
+#define RCU_NOCB_WAKE_FORCE 4
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
/* For jiffies_till_first_fqs and */
@@ -303,9 +336,8 @@ struct rcu_state {
/* The following fields are guarded by the root rcu_node's lock. */
- u8 boost ____cacheline_internodealigned_in_smp;
- /* Subject to priority boost. */
- unsigned long gp_seq; /* Grace-period sequence #. */
+ unsigned long gp_seq ____cacheline_internodealigned_in_smp;
+ /* Grace-period sequence #. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
struct task_struct *gp_kthread; /* Task for grace periods. */
@@ -314,6 +346,9 @@ struct rcu_state {
short gp_state; /* GP kthread sleep state. */
unsigned long gp_wake_time; /* Last GP kthread wake. */
unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
+ unsigned long gp_seq_polled; /* GP seq for polled API. */
+ unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */
+ unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */
/* End of fields guarded by root rcu_node's lock. */
@@ -324,6 +359,8 @@ struct rcu_state {
/* rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */
+
struct mutex exp_mutex; /* Serialize expedited GP. */
struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
@@ -349,6 +386,10 @@ struct rcu_state {
/* in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
+ int nr_fqs_jiffies_stall; /* Number of fqs loops after
+ * which read jiffies and set
+ * jiffies_stall. Stall
+ * warnings disabled if !0. */
unsigned long jiffies_resched; /* Time at which to resched */
/* a reluctant CPU. */
unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */
@@ -356,9 +397,10 @@ struct rcu_state {
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
- raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
+ arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
/* Synchronize offline with */
/* GP pre-initialization. */
+ int nocb_is_setup; /* nocb is setup from boot */
};
/* Values for rcu_state structure's gp_flags field. */
@@ -413,30 +455,27 @@ static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static bool rcu_is_callbacks_kthread(void);
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
static void rcu_cpu_kthread_setup(unsigned int cpu);
-static void __init rcu_spawn_boost_kthreads(void);
-static void rcu_prepare_kthreads(int cpu);
-static void rcu_cleanup_after_idle(void);
-static void rcu_prepare_for_idle(void);
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
-static void rcu_preempt_deferred_qs(struct task_struct *t);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j);
+ unsigned long j, bool lazy);
static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags);
+ bool *was_alldone, unsigned long flags,
+ bool lazy);
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
unsigned long flags);
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
-static void __init rcu_spawn_nocb_kthreads(void);
static void show_rcu_nocb_state(struct rcu_data *rdp);
static void rcu_nocb_lock(struct rcu_data *rdp);
static void rcu_nocb_unlock(struct rcu_data *rdp);
@@ -445,12 +484,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
-#define rcu_nocb_lock_irqsave(rdp, flags) \
-do { \
- if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
- local_irq_save(flags); \
- else \
- raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
+
+/*
+ * Disable IRQs before checking offloaded state so that local
+ * locking is safe against concurrent de-offloading.
+ */
+#define rcu_nocb_lock_irqsave(rdp, flags) \
+do { \
+ local_irq_save(flags); \
+ if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
+ raw_spin_lock(&(rdp)->nocb_lock); \
} while (0)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
@@ -458,10 +501,6 @@ do { \
static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
-static void rcu_dynticks_task_enter(void);
-static void rcu_dynticks_task_exit(void);
-static void rcu_dynticks_task_trace_enter(void);
-static void rcu_dynticks_task_trace_exit(void);
/* Forward declarations for tree_stall.h */
static void record_gp_stall_check_time(void);
@@ -469,3 +508,6 @@ static void rcu_iw_handler(struct irq_work *iwp);
static void check_cpu_stall(struct rcu_data *rdp);
static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
const unsigned long gpssdelay);
+
+/* Forward declarations for tree_exp.h. */
+static void sync_rcu_do_polled_gp(struct work_struct *wp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8760b6ead770..2ac440bc7e10 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -11,6 +11,7 @@
static void rcu_exp_handler(void *unused);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp);
/*
* Record the start of an expedited grace period.
@@ -18,6 +19,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_exp_gp_seq_start(void)
{
rcu_seq_start(&rcu_state.expedited_sequence);
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
}
/*
@@ -34,6 +36,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void)
*/
static void rcu_exp_gp_seq_end(void)
{
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
rcu_seq_end(&rcu_state.expedited_sequence);
smp_mb(); /* Ensure that consecutive grace periods serialize. */
}
@@ -170,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
return ret;
}
-
/*
* Report the exit from RCU read-side critical section for the last task
* that queued itself during or before the current expedited preemptible-RCU
@@ -198,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
- swake_up_one(&rcu_state.expedited_wq);
+ swake_up_one_online(&rcu_state.expedited_wq);
}
break;
}
@@ -255,7 +257,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
*/
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
- WRITE_ONCE(rdp->exp_deferred_qs, false);
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, false);
rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
}
@@ -334,15 +336,13 @@ fastpath:
* Select the CPUs within the specified rcu_node that the upcoming
* expedited grace period needs to wait for.
*/
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
{
int cpu;
unsigned long flags;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
- struct rcu_exp_work *rewp =
- container_of(wp, struct rcu_exp_work, rew_work);
struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -358,7 +358,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
!(rnp->qsmaskinitnext & mask)) {
mask_ofl_test |= mask;
} else {
- snap = rcu_dynticks_snap(rdp);
+ snap = rcu_dynticks_snap(cpu);
if (rcu_dynticks_in_eqs(snap))
mask_ofl_test |= mask;
else
@@ -387,6 +387,7 @@ retry_ipi:
continue;
}
if (get_cpu() == cpu) {
+ mask_ofl_test |= mask;
put_cpu();
continue;
}
@@ -416,13 +417,119 @@ retry_ipi:
rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
}
+static void rcu_exp_sel_wait_wake(unsigned long s);
+
+#ifdef CONFIG_RCU_EXP_KTHREAD
+static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+
+ __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+ return !!READ_ONCE(rcu_exp_par_gp_kworker);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+ kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /*
+ * Use rcu_exp_par_gp_kworker, because flushing a work item from
+ * another work item on the same kthread worker can result in
+ * deadlock.
+ */
+ kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+ kthread_flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct kthread_work *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+ kthread_init_work(&rew->rew_work, wait_rcu_exp_gp);
+ kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+}
+#else /* !CONFIG_RCU_EXP_KTHREAD */
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp =
+ container_of(wp, struct rcu_exp_work, rew_work);
+
+ __sync_rcu_exp_select_node_cpus(rewp);
+}
+
+static inline bool rcu_gp_par_worker_started(void)
+{
+ return !!READ_ONCE(rcu_par_gp_wq);
+}
+
+static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
+{
+ int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
+
+ INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+ /* If all offline, queue the work on an unbound CPU. */
+ if (unlikely(cpu > rnp->grphi - rnp->grplo))
+ cpu = WORK_CPU_UNBOUND;
+ else
+ cpu += rnp->grplo;
+ queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+}
+
+static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
+{
+ flush_work(&rnp->rew.rew_work);
+}
+
+/*
+ * Work-queue handler to drive an expedited grace period forward.
+ */
+static void wait_rcu_exp_gp(struct work_struct *wp)
+{
+ struct rcu_exp_work *rewp;
+
+ rewp = container_of(wp, struct rcu_exp_work, rew_work);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
+}
+
+static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
+{
+ INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
+ queue_work(rcu_gp_wq, &rew->rew_work);
+}
+
+static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
+{
+ destroy_work_on_stack(&rew->rew_work);
+}
+#endif /* CONFIG_RCU_EXP_KTHREAD */
+
/*
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
*/
static void sync_rcu_exp_select_cpus(void)
{
- int cpu;
struct rcu_node *rnp;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
@@ -434,28 +541,21 @@ static void sync_rcu_exp_select_cpus(void)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- if (!READ_ONCE(rcu_par_gp_wq) ||
+ if (!rcu_gp_par_worker_started() ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
- /* No workqueues yet or last leaf, do direct call. */
+ /* No worker started yet or last leaf, do direct call. */
sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
continue;
}
- INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
- /* If all offline, queue the work on an unbound CPU. */
- if (unlikely(cpu > rnp->grphi - rnp->grplo))
- cpu = WORK_CPU_UNBOUND;
- else
- cpu += rnp->grplo;
- queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+ sync_rcu_exp_select_cpus_queue_work(rnp);
rnp->exp_need_flush = true;
}
- /* Wait for workqueue jobs (if any) to complete. */
+ /* Wait for jobs (if any) to complete. */
rcu_for_each_leaf_node(rnp)
if (rnp->exp_need_flush)
- flush_work(&rnp->rew.rew_work);
+ sync_rcu_exp_select_cpus_flush_work(rnp);
}
/*
@@ -493,34 +593,42 @@ static void synchronize_rcu_expedited_wait(void)
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root();
+ unsigned long flags;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
- jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_stall = rcu_exp_jiffies_till_stall_check();
jiffies_start = jiffies;
if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) {
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ mask = READ_ONCE(rnp->expmask);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = true;
- tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
+ if (cpu_online(cpu))
+ tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
}
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
j = READ_ONCE(jiffies_till_first_fqs);
if (synchronize_rcu_expedited_wait_once(j + HZ))
return;
- WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT));
}
for (;;) {
+ unsigned long j;
+
if (synchronize_rcu_expedited_wait_once(jiffies_stall))
return;
if (rcu_stall_is_suppressed())
continue;
- panic_on_rcu_stall();
+ j = jiffies;
+ rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start));
+ trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
rcu_state.name);
ndetected = 0;
@@ -534,18 +642,19 @@ static void synchronize_rcu_expedited_wait(void)
continue;
ndetected++;
rdp = per_cpu_ptr(&rcu_data, cpu);
- pr_cont(" %d-%c%c%c", cpu,
+ pr_cont(" %d-%c%c%c%c", cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
- "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+ "D."[!!data_race(rdp->cpu_no_qs.b.exp)]);
}
}
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
- jiffies - jiffies_start, rcu_state.expedited_sequence,
+ j - jiffies_start, rcu_state.expedited_sequence,
data_race(rnp_root->expmask),
".T"[!!data_race(rnp_root->exp_tasks)]);
if (ndetected) {
- pr_err("blocking rcu_node structures:");
+ pr_err("blocking rcu_node structures (internal RCU debug):");
rcu_for_each_node_breadth_first(rnp) {
if (rnp == rnp_root)
continue; /* printed unconditionally */
@@ -563,10 +672,14 @@ static void synchronize_rcu_expedited_wait(void)
mask = leaf_node_cpu_bit(rnp, cpu);
if (!(READ_ONCE(rnp->expmask) & mask))
continue;
+ preempt_disable(); // For smp_processor_id() in dump_cpu_task().
dump_cpu_task(cpu);
+ preempt_enable();
}
+ rcu_exp_print_detail_task_stall_rnp(rnp);
}
- jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
+ panic_on_rcu_stall();
}
}
@@ -617,17 +730,6 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
rcu_exp_wait_wake(s);
}
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
- struct rcu_exp_work *rewp;
-
- rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -652,10 +754,10 @@ static void rcu_exp_handler(void *unused)
*/
if (!depth) {
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
- rcu_dynticks_curr_cpu_in_eqs()) {
+ rcu_is_cpu_rrupt_from_idle()) {
rcu_report_exp_rdp(rdp);
} else {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
set_tsk_need_resched(t);
set_preempt_need_resched();
}
@@ -677,7 +779,7 @@ static void rcu_exp_handler(void *unused)
if (depth > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
- rdp->exp_deferred_qs = true;
+ WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
t->rcu_read_unlock_special.b.exp_hint = true;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -704,9 +806,11 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
int ndetected = 0;
struct task_struct *t;
- if (!READ_ONCE(rnp->exp_tasks))
- return 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rnp->exp_tasks) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return 0;
+ }
t = list_entry(rnp->exp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
@@ -717,6 +821,36 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return ndetected;
}
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, dumping the stack of each that is blocking the current
+ * expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ if (!rcu_exp_stall_task_details)
+ return;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!READ_ONCE(rnp->exp_tasks)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
#else /* #ifdef CONFIG_PREEMPT_RCU */
/* Request an expedited quiescent state. */
@@ -734,11 +868,13 @@ static void rcu_exp_handler(void *unused)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
return;
- if (rcu_is_cpu_rrupt_from_idle()) {
+ if (rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) {
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
return;
}
@@ -759,7 +895,7 @@ static void sync_sched_exp_online_cleanup(int cpu)
my_cpu = get_cpu();
/* Quiescent state either not needed or already requested, leave. */
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) {
+ READ_ONCE(rdp->cpu_no_qs.b.exp)) {
put_cpu();
return;
}
@@ -787,6 +923,15 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
return 0;
}
+/*
+ * Because preemptible RCU does not exist, we never have to print out
+ * tasks blocked within RCU read-side critical sections that are blocking
+ * the current expedited grace period.
+ */
+static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/**
@@ -812,6 +957,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
+ unsigned long flags;
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -822,12 +968,25 @@ void synchronize_rcu_expedited(void)
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
	/* Is the state such that the call is a grace period? */
- if (rcu_blocking_is_gp())
- return;
+ if (rcu_blocking_is_gp()) {
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs
+ // at process level. Therefore, this expedited GP overlaps
+ // with other expedited GPs only by being fully nested within
+ // them, which allows reuse of ->gp_seq_polled_exp_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT);
+ local_irq_restore(flags);
+ return; // Context allows vacuous grace periods.
+ }
/* If expedited grace periods are prohibited, fall back to normal. */
if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
+ wait_rcu_gp(call_rcu_hurry);
return;
}
@@ -843,20 +1002,155 @@ void synchronize_rcu_expedited(void)
} else {
/* Marshall arguments & schedule the expedited grace period. */
rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew.rew_work);
+ synchronize_rcu_expedited_queue_work(&rew);
}
/* Wait for expedited grace period to complete. */
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
- smp_mb(); /* Workqueue actions happen before return. */
+ smp_mb(); /* Work actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
if (likely(!boottime))
- destroy_work_on_stack(&rew.rew_work);
+ synchronize_rcu_expedited_destroy_work(&rew);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
+ * Ensure that start_poll_synchronize_rcu_expedited() has the expedited
+ * RCU grace periods that it needs.
+ */
+static void sync_rcu_do_polled_gp(struct work_struct *wp)
+{
+ unsigned long flags;
+ int i = 0;
+ struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq);
+ unsigned long s;
+
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+ if (s == RCU_GET_STATE_COMPLETED)
+ return;
+ while (!poll_state_synchronize_rcu(s)) {
+ synchronize_rcu_expedited();
+ if (i == 10 || i == 20)
+ pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled));
+ i++;
+ }
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ s = rnp->exp_seq_poll_rq;
+ if (poll_state_synchronize_rcu(s))
+ rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+}
+
+/**
+ * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period
+ *
+ * Returns a cookie to pass to a call to cond_synchronize_rcu(),
+ * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(),
+ * allowing them to determine whether or not any sort of grace period has
+ * elapsed in the meantime. If the needed expedited grace period is not
+ * already slated to start, initiates that grace period.
+ */
+unsigned long start_poll_synchronize_rcu_expedited(void)
+{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long s;
+
+ s = get_state_synchronize_rcu();
+ rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
+ rnp = rdp->mynode;
+ if (rcu_init_invoked())
+ raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
+ if (!poll_state_synchronize_rcu(s)) {
+ if (rcu_init_invoked()) {
+ rnp->exp_seq_poll_rq = s;
+ queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+ }
+ }
+ if (rcu_init_invoked())
+ raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
+
+ return s;
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
+
+/**
+ * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period
+ * @rgosp: Place to put snapshot of grace-period state
+ *
+ * Places the normal and expedited grace-period states in rgosp. This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed expedited grace period is not already slated to start,
+ * initiates that grace period.
+ */
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ get_state_synchronize_rcu_full(rgosp);
+ (void)start_poll_synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full);
+
+/**
+ * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
+ *
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
+ *
+ * If any type of full RCU grace period has elapsed since the earlier
+ * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(),
+ * or start_poll_synchronize_rcu_expedited(), just return. Otherwise,
+ * invoke synchronize_rcu_expedited() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @oldstate and that returned at the end
+ * of this function.
+ */
+void cond_synchronize_rcu_expedited(unsigned long oldstate)
+{
+ if (!poll_state_synchronize_rcu(oldstate))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
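
[Editor's aside, not part of this patch: a minimal usage sketch of the cookie-based polled expedited API added above. "struct foo" and its helpers are hypothetical, and the object is assumed to be already unpublished from all reader-visible structures before the cookie is taken.]

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	unsigned long gp_cookie;
	/* ... payload ... */
};

/* Retire: object already unlinked; snapshot RCU state and kick off an expedited GP. */
static void foo_retire(struct foo *fp)
{
	fp->gp_cookie = start_poll_synchronize_rcu_expedited();
}

/* Free: wait (expeditedly) only if no grace period has elapsed since retirement. */
static void foo_free(struct foo *fp)
{
	cond_synchronize_rcu_expedited(fp->gp_cookie);
	kfree(fp);
}

[If enough time passes between foo_retire() and foo_free(), cond_synchronize_rcu_expedited() returns immediately and the free path incurs no grace-period wait.]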
+
+/**
+ * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return. Otherwise, invoke synchronize_rcu_expedited()
+ * to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
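
[Editor's aside, not part of this patch: a similarly hypothetical sketch of the wrap-proof full-state variants, here using a non-blocking poll on the free path rather than a conditional wait.]

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct bar {
	struct rcu_gp_oldstate gp_state;
	/* ... payload ... */
};

static void bar_retire(struct bar *bp)
{
	/* bp is assumed to be already unlinked from reader-visible structures. */
	start_poll_synchronize_rcu_expedited_full(&bp->gp_state);
}

/* Returns true if bp was freed, false if the caller must retry later. */
static bool bar_try_free(struct bar *bp)
{
	if (!poll_state_synchronize_rcu_full(&bp->gp_state))
		return false;
	kfree(bp);
	return true;
}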
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
new file mode 100644
index 000000000000..4efbf7333d4e
--- /dev/null
+++ b/kernel/rcu/tree_nocb.h
@@ -0,0 +1,1805 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ * Copyright SUSE, 2021
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
+ * Frederic Weisbecker <frederic@kernel.org>
+ */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool __read_mostly rcu_nocb_poll;	/* Offload kthreads are to poll. */
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return lockdep_is_held(&rdp->nocb_lock);
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ /* Race on early boot between thread creation and assignment */
+ if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
+ return true;
+
+ if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
+ if (in_task())
+ return true;
+ return false;
+}
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
+ */
+
+
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * If the list is invalid, a warning is emitted and all CPUs are offloaded.
+ */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ if (*str == '=') {
+ if (cpulist_parse(++str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
+ }
+ rcu_state.nocb_is_setup = true;
+ return 1;
+}
+__setup("rcu_nocbs", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = true;
+ return 1;
+}
+__setup("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, increment ->nocb_lock_contended to
+ * flag the contention.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+ __acquires(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ atomic_inc(&rdp->nocb_lock_contended);
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
+ atomic_dec(&rdp->nocb_lock_contended);
+}
+
+/*
+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
+ * not contended. Please note that this is extremely special-purpose,
+ * relying on the fact that at most two kthreads and one CPU contend for
+ * this lock, and also that the two kthreads are guaranteed to have frequent
+ * grace-period-duration time intervals between successive acquisitions
+ * of the lock. This allows us to use an extremely simple throttling
+ * mechanism, and further to apply it only to the CPU doing floods of
+ * call_rcu() invocations. Don't try this at home!
+ */
+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
+{
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
+ cpu_relax();
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+ __releases(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_rdp_is_offloaded(rdp))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_rdp_is_offloaded(rdp))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+ swake_up_all(sq);
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
+}
+
+static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp,
+ bool force, unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ bool needwake = false;
+
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ return false;
+ }
+
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&rdp_gp->nocb_timer);
+ }
+
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+ }
+
+ return needwake;
+}
+
+/*
+ * Kick the GP kthread for this NOCB group.
+ */
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+}
+
+/*
+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
+ * can elapse before lazy callbacks are flushed. Lazy callbacks
+ * could be flushed much earlier for a number of other reasons;
+ * however, LAZY_FLUSH_JIFFIES ensures that no lazy callbacks are
+ * left unsubmitted to RCU after that many jiffies.
+ */
+#define LAZY_FLUSH_JIFFIES (10 * HZ)
+static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES;
+
+#ifdef CONFIG_RCU_LAZY
+// To be called only from test code.
+void rcu_lazy_set_jiffies_till_flush(unsigned long jif)
+{
+ jiffies_till_flush = jif;
+}
+EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush);
+
+unsigned long rcu_lazy_get_jiffies_till_flush(void)
+{
+ return jiffies_till_flush;
+}
+EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush);
+#endif
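
[Editor's aside, not part of this patch: as context for the lazy-flush timeout above, a hedged caller-side sketch, assuming CONFIG_RCU_LAZY and the hypothetical "struct baz" below. call_rcu() callbacks may be batched for up to the flush interval, whereas call_rcu_hurry() requests the traditional prompt behavior.]

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct baz {
	struct rcu_head rh;
	/* ... payload ... */
};

static void baz_reclaim_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct baz, rh));
}

static void baz_release(struct baz *bzp, bool urgent)
{
	if (urgent)
		call_rcu_hurry(&bzp->rh, baz_reclaim_cb);	/* Prompt grace period. */
	else
		call_rcu(&bzp->rh, baz_reclaim_cb);		/* May be batched lazily. */
}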
+
+/*
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
+ */
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
+ /*
+ * Bypass wakeup overrides previous deferments. In case of
+ * callback storms, no need to wake up too early.
+ */
+ if (waketype == RCU_NOCB_WAKE_LAZY &&
+ rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else {
+ if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+ if (rdp_gp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ }
+
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Return true if there was something to be flushed and it succeeded, otherwise
+ * false.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp_in,
+ unsigned long j, bool lazy)
+{
+ struct rcu_cblist rcl;
+ struct rcu_head *rhp = rhp_in;
+
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+
+ /*
+ * If the new CB requested was a lazy one, queue it onto the main
+ * ->cblist so that we can take advantage of the grace-period that will
+ * happen regardless. But queue it onto the bypass list first so that
+ * the lazy CB is ordered with the existing CBs in the bypass list.
+ */
+ if (lazy && rhp) {
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ rhp = NULL;
+ }
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ WRITE_ONCE(rdp->lazy_len, 0);
+
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j, bool lazy)
+{
+ if (!rcu_rdp_is_offloaded(rdp))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy);
+}
+
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_rdp_is_offloaded(rdp) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
+}
+
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags,
+ bool lazy)
+{
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len));
+
+ lockdep_assert_irqs_disabled();
+
+ // Pure softirq/rcuc based processing: no bypassing, no
+ // locking.
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // In the process of (de-)offloading: no bypassing, but
+ // locking.
+ if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false; /* Not offloaded, no bypassing. */
+ }
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ // Lazy CBs throttle this back and do immediate bypass queuing.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ (ncbs && bypass_is_lazy &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) {
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+
+ // The flush succeeded and we moved CBs into the regular list.
+ // Don't wait for the wake up timer as it may be too far ahead.
+ // Wake up the GP thread now instead, if the cblist was empty.
+ __call_rcu_nocb_wake(rdp, *was_alldone, flags);
+
+ return true; // Callback already enqueued.
+ }
+
+ // We need to use the bypass.
+ rcu_nocb_wait_contended(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+
+ if (lazy)
+ WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
+
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+ smp_mb(); /* Order enqueue before wake. */
+ // A wake up of the grace period kthread or timer adjustment
+ // needs to be done only if:
+ // 1. Bypass list was fully empty before (this is the first
+ // bypass list entry), or:
+ // 2. Both of these conditions are met:
+ // a. The bypass list previously had only lazy CBs, and:
+ // b. The new CB is non-lazy.
+ if (ncbs && (!bypass_is_lazy || lazy)) {
+ local_irq_restore(flags);
+ } else {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+ }
+ return true; // Callback already enqueued.
+}
+
+/*
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
+ *
+ * If warranted, also wake up the kthread servicing this CPU's queues.
+ */
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ long bypass_len;
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long lazy_len;
+ long len;
+ struct task_struct *t;
+
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
+ if (rcu_nocb_poll || !t) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+	// Need to actually do a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_len = READ_ONCE(rdp->lazy_len);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
+ // Only lazy CBs in bypass list
+ if (lazy_len && bypass_len == lazy_len) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazy"));
+ } else if (!irqs_disabled_flags(flags)) {
+ /* ... if queue was empty ... */
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ /* ... or if many callbacks queued. */
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp->nocb_timer)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+}
+
+static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
+ bool *wake_state)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+		 * Offloading. Set our flag and notify the offload worker.
+		 * We will handle this rdp until it is de-offloaded.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 1;
+ } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+		 * We will ignore this rdp until it is re-offloaded.
+ */
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *wake_state = true;
+ ret = 0;
+ } else {
+ WARN_ON_ONCE(1);
+ ret = -1;
+ }
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ return ret;
+}
+
+static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu)
+{
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+}
+
+/*
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
+ */
+static void nocb_gp_wait(struct rcu_data *my_rdp)
+{
+ bool bypass = false;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool gotcbs = false;
+ unsigned long j = jiffies;
+ bool lazy = false;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
+ bool needwake;
+ bool needwake_gp;
+ struct rcu_data *rdp, *rdp_toggling = NULL;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+ bool wasempty = false;
+
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
+ /*
+ * An rcu_data structure is removed from the list after its
+ * CPU is de-offloaded and added to the list before that CPU is
+ * (re-)offloaded. If the following loop happens to be referencing
+ * that rcu_data structure during the time that the corresponding
+ * CPU is de-offloaded and then immediately re-offloaded, this
+ * loop's rdp pointer will be carried to the end of the list by
+ * the resulting pair of list operations. This can cause the loop
+ * to skip over some of the rcu_data structures that were supposed
+ * to have been scanned. Fortunately a new iteration through the
+ * entire loop is forced after a given CPU's rcu_data structure
+ * is added to the list, so the skipped-over rcu_data structures
+ * won't be ignored for long.
+ */
+ list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+ long bypass_ncbs;
+ bool flush_bypass = false;
+ long lazy_ncbs;
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ lockdep_assert_held(&rdp->nocb_lock);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+
+ if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue; /* No callbacks here, try next. */
+ }
+
+ if (flush_bypass) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+ }
+
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
+ if (bypass_ncbs == lazy_ncbs)
+ lazy = true;
+ else
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(wasempty &&
+ !rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ smp_mb(); /* CB invocation -after- GP end. */
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ }
+
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ if (!rcu_nocb_poll) {
+		// If the bypass list has only lazy CBs, add a deferred lazy wake up.
+ if (lazy && !bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazyIsDeferred"));
+ // Otherwise add a deferred bypass wake up.
+ } else if (bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
+ }
+ }
+
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ if (list_empty(&my_rdp->nocb_head_rdp)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (!my_rdp->nocb_toggling_rdp)
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ /* Wait for any offloading rdp */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ schedule_timeout_idle(1);
+ }
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+ swait_event_interruptible_exclusive(
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+ }
+
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ // (De-)queue an rdp to/from the group if its nocb state is changing
+ rdp_toggling = my_rdp->nocb_toggling_rdp;
+ if (rdp_toggling)
+ my_rdp->nocb_toggling_rdp = NULL;
+
+ if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ } else {
+ rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
+ if (rdp_toggling) {
+ /*
+			 * Paranoid locking to make sure nocb_toggling_rdp is
+			 * reset *before* we (re)set SEGCBLIST_KTHREAD_GP, or we could
+			 * race with another round of nocb toggling for this rdp.
+			 * Nocb locking should already prevent that, but we stick
+			 * to paranoia, especially on this rare path.
+ */
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ my_rdp->nocb_toggling_rdp = NULL;
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ }
+
+ if (rdp_toggling) {
+ bool wake_state = false;
+ int ret;
+
+ ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+ if (ret == 1)
+ list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
+ else if (ret == 0)
+ list_del(&rdp_toggling->nocb_entry_rdp);
+ if (wake_state)
+ swake_up_one(&rdp_toggling->nocb_state_wq);
+ }
+
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
+}
+
+/*
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
+ */
+static int rcu_nocb_gp_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
+/*
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
+ */
+static void nocb_cb_wait(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_state = false;
+ bool needwake_gp = false;
+ bool can_sleep = true;
+ struct rcu_node *rnp = rdp->mynode;
+
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
+
+
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ /*
+ * Disable BH to provide the expected environment. Also, when
+ * transitioning to/from NOCB mode, a self-requeuing callback might
+ * be invoked from softirq. A short grace period could cause both
+	 * instances of this callback to execute concurrently.
+ */
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+ if (rcu_segcblist_ready_cbs(cblist))
+ can_sleep = false;
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+		 * We won't touch the callbacks and will keep sleeping until
+		 * we get re-offloaded, if ever.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+
+ WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
+
+ if (rdp->nocb_cb_sleep)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
+ */
+static int rcu_nocb_cb_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
+ for (;;) {
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp, int level,
+ unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ int ndw;
+ int ret;
+
+ if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ return false;
+ }
+
+ ndw = rdp_gp->nocb_defer_wakeup;
+ ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+ WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+ raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from the fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+ return false;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
+}
+
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
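The deferred-wakeup helpers above act only when the recorded deferral level is at
least the level being flushed, and a missed fastpath check is eventually mopped up
by ->nocb_timer. A minimal user-space sketch of that level gating follows; the enum
values are illustrative stand-ins for the kernel's RCU_NOCB_WAKE_* levels, not the
real definitions.

#include <stdio.h>

enum wake_level { WAKE_NOT, WAKE_BYPASS, WAKE, WAKE_FORCE };

static enum wake_level deferred = WAKE_NOT;

static void defer_wakeup(enum wake_level level)
{
	if (deferred < level)		/* Keep only the strongest request. */
		deferred = level;
}

static int flush_deferred_wakeup(enum wake_level level)
{
	if (deferred < level)		/* Nothing strong enough is pending. */
		return 0;
	deferred = WAKE_NOT;		/* Consume the request and "wake". */
	return 1;
}

int main(void)
{
	defer_wakeup(WAKE_BYPASS);
	printf("flush at WAKE: %d\n", flush_deferred_wakeup(WAKE));	/* 0 */
	defer_wakeup(WAKE_FORCE);
	printf("flush at WAKE: %d\n", flush_deferred_wakeup(WAKE));	/* 1 */
	return 0;
}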
+
+static int rdp_offload_toggle(struct rcu_data *rdp,
+ bool offload, unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+
+ rcu_segcblist_offload(cblist, offload);
+
+ if (rdp->nocb_cb_sleep)
+ rdp->nocb_cb_sleep = false;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ /*
+ * Ignore former value of nocb_cb_sleep and force wake up as it could
+ * have been spuriously set to false already.
+ */
+ swake_up_one(&rdp->nocb_cb_wq);
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ // Queue this rdp for add/del to/from the list to iterate on rcuog
+ WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ return wake_gp;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ /*
+ * rcu_nocb_rdp_deoffload() may be called directly if
+ * rcuog/o[p] spawn failed, because at this time the rdp->cpu
+ * is not online yet.
+ */
+ WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu));
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Flush once and for all now. This suffices because we are
+ * running on the target CPU holding ->nocb_lock (thus having
+ * interrupts disabled), and because rdp_offload_toggle()
+ * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
+ * Thus future calls to rcu_segcblist_completely_offloaded() will
+ * return false, which means that future calls to rcu_nocb_try_bypass()
+ * will refuse to put anything into the bypass.
+ */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ /*
+	 * Start with invoking rcu_core() early. This way, if the current thread
+	 * happens to preempt an ongoing call to rcu_core() in the middle,
+	 * leaving some work ignored because rcu_core() still thinks the rdp is
+	 * completely offloaded, we are guaranteed that a near-future invocation
+	 * of rcu_core() will catch up.
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE);
+ invoke_rcu_core();
+ wake_gp = rdp_offload_toggle(rdp, false, flags);
+
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (rdp_gp->nocb_gp_kthread) {
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ /*
+		 * If the rcuo[p] kthread spawn failed, directly clear SEGCBLIST_KTHREAD_CB.
+		 * Then just wait for SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
+ */
+ if (!rdp->nocb_cb_kthread) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+ } else {
+ /*
+		 * No kthread to clear the flags for us or to remove the rdp from the
+		 * nocb list to iterate over. Do it here instead. Locking doesn't look
+		 * strictly necessary here, but we stick to paranoia in this rare path.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(&rdp->cblist,
+ SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ list_del(&rdp->nocb_entry_rdp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ /*
+ * Lock one last time to acquire latest callback updates from kthreads
+ * so we can later handle callbacks locally without locking.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Theoretically we could clear SEGCBLIST_LOCKING after the nocb
+ * lock is released but how about being paranoid for once?
+ */
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING);
+ /*
+ * Without SEGCBLIST_LOCKING, we can't use
+ * rcu_nocb_unlock_irqrestore() anymore.
+ */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ /* Sanity check */
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+
+
+ return 0;
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu);
+ ret = -EINVAL;
+ }
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ /*
+	 * For now we only support re-offload, i.e., the rdp must have been
+	 * offloaded at boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+
+ /*
+ * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING
+ * is set.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+
+ /*
+ * We didn't take the nocb lock while working on the
+ * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode).
+	 * Every modification previously done to rdp->cblist must be
+	 * visible remotely to the nocb kthreads upon wake-up, after
+	 * they have read the cblist flags.
+ *
+ * The layout against nocb_lock enforces that ordering:
+ *
+ * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
+ * ------------------------- ----------------------------
+ * WRITE callbacks rcu_nocb_lock()
+ * rcu_nocb_lock() READ flags
+ * WRITE flags READ callbacks
+ * rcu_nocb_unlock() rcu_nocb_unlock()
+ */
+ wake_gp = rdp_offload_toggle(rdp, true, flags);
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+ /*
+ * All kthreads are ready to work, we can finally relieve rcu_core() and
+ * enable nocb bypass.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ return 0;
+}
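The ordering diagram in rcu_nocb_rdp_offload() is the classic publish-then-flag
pattern under a lock. A stand-alone user-space sketch of the same argument, with a
pthread mutex standing in for ->nocb_lock and illustrative variable names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int callbacks;	/* stands in for rdp->cblist contents */
static int offloaded;	/* stands in for the SEGCBLIST_OFFLOADED flag */

static void *writer(void *arg)
{
	callbacks = 42;			/* WRITE callbacks */
	pthread_mutex_lock(&lock);	/* rcu_nocb_lock() */
	offloaded = 1;			/* WRITE flags */
	pthread_mutex_unlock(&lock);	/* rcu_nocb_unlock() */
	return NULL;
}

static void *reader(void *arg)
{
	pthread_mutex_lock(&lock);	/* rcu_nocb_lock() */
	if (offloaded)			/* READ flags */
		printf("callbacks seen: %d\n", callbacks);	/* READ callbacks */
	pthread_mutex_unlock(&lock);	/* rcu_nocb_unlock() */
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}

Because the reader can only observe offloaded == 1 after the writer's unlock, the
mutex ordering guarantees it also observes the earlier write to callbacks.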
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ cpus_read_lock();
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu);
+ ret = -EINVAL;
+ }
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
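Both helpers are exported for in-kernel users and are serialized against each other
by ->barrier_mutex and cpus_read_lock(). A hypothetical caller (not part of this
patch) toggling CPU 3 might look like the fragment below; the function name is made
up, and the error handling simply reports the -EINVAL returned for offline CPUs.

static void example_toggle_nocb_cpu3(void)
{
	int ret;

	ret = rcu_nocb_cpu_deoffload(3);	/* back to softirq/rcuc processing */
	if (ret) {
		pr_info("CB de-offload of CPU 3 failed: %d\n", ret);
		return;
	}
	ret = rcu_nocb_cpu_offload(3);		/* hand callbacks back to rcuo kthreads */
	if (ret)
		pr_info("CB offload of CPU 3 failed: %d\n", ret);
}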
+
+#ifdef CONFIG_RCU_LAZY
+static unsigned long
+lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long count = 0;
+
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+
+ /* Protect rcu_nocb_mask against concurrent (de-)offloading. */
+ if (!mutex_trylock(&rcu_state.barrier_mutex))
+ return 0;
+
+ /* Snapshot count of all CPUs */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ count += READ_ONCE(rdp->lazy_len);
+ }
+
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return count ? count : SHRINK_EMPTY;
+}
+
+static unsigned long
+lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long count = 0;
+
+ if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask)))
+ return 0;
+ /*
+ * Protect against concurrent (de-)offloading. Otherwise nocb locking
+ * may be ignored or imbalanced.
+ */
+ if (!mutex_trylock(&rcu_state.barrier_mutex)) {
+ /*
+ * But really don't insist if barrier_mutex is contended since we
+ * can't guarantee that it will never engage in a dependency
+ * chain involving memory allocation. The lock is seldom contended
+ * anyway.
+ */
+ return 0;
+ }
+
+ /* Snapshot count of all CPUs */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int _count;
+
+ if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)))
+ continue;
+
+ if (!READ_ONCE(rdp->lazy_len))
+ continue;
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Recheck under the nocb lock. Since we are not holding the bypass
+		 * lock we may still race with increments from the enqueuer, but we
+		 * know for sure whether there is at least one lazy callback.
+ */
+ _count = READ_ONCE(rdp->lazy_len);
+ if (!_count) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue;
+ }
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ sc->nr_to_scan -= _count;
+ count += _count;
+ if (sc->nr_to_scan <= 0)
+ break;
+ }
+
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return count ? count : SHRINK_STOP;
+}
+#endif // #ifdef CONFIG_RCU_LAZY
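The scan path above flushes each offloaded CPU's lazy callbacks and charges them
against the shrinker's budget. A small stand-alone model of that accounting, with
made-up per-CPU counts:

#include <stdio.h>

int main(void)
{
	long lazy_len[4] = { 5, 0, 7, 3 };	/* per-CPU rdp->lazy_len snapshot */
	long nr_to_scan = 10;			/* shrink_control budget */
	long count = 0;

	for (int cpu = 0; cpu < 4; cpu++) {
		if (!lazy_len[cpu])
			continue;		/* nothing lazy on this CPU */
		count += lazy_len[cpu];		/* flushed into the main cblist */
		nr_to_scan -= lazy_len[cpu];
		if (nr_to_scan <= 0)
			break;			/* budget consumed, stop early */
	}
	printf("reported reclaimed: %ld\n", count);	/* prints 12 */
	return 0;
}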
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ struct rcu_data *rdp;
+ const struct cpumask *cpumask = NULL;
+ struct shrinker * __maybe_unused lazy_rcu_shrinker;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
+ cpumask = tick_nohz_full_mask;
+#endif
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) &&
+ !rcu_state.nocb_is_setup && !cpumask)
+ cpumask = cpu_possible_mask;
+
+ if (cpumask) {
+ if (!cpumask_available(rcu_nocb_mask)) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ }
+
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, cpumask);
+ rcu_state.nocb_is_setup = true;
+ }
+
+ if (!rcu_state.nocb_is_setup)
+ return;
+
+#ifdef CONFIG_RCU_LAZY
+ lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy");
+ if (!lazy_rcu_shrinker) {
+ pr_err("Failed to allocate lazy_rcu shrinker!\n");
+ } else {
+ lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count;
+ lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan;
+
+ shrinker_register(lazy_rcu_shrinker);
+ }
+#endif // #ifdef CONFIG_RCU_LAZY
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ if (cpumask_empty(rcu_nocb_mask))
+ pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+ else
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist, true);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
+ }
+ rcu_organize_nocb_kthreads();
+}
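rcu_state.nocb_is_setup ends up true here whenever offloading was requested at boot,
whether explicitly, via NO_HZ_FULL, or through CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y. A
purely illustrative kernel command line that would offload CPUs 1-7 and enable
polling (the CPU list is an example, not a recommendation):

	rcu_nocbs=1-7 rcu_nocb_poll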
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
+ raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
+ WRITE_ONCE(rdp->lazy_len, 0);
+ mutex_init(&rdp->nocb_gp_kthread_mutex);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
+ */
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
+ struct task_struct *t;
+ struct sched_param sp;
+
+ if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup)
+ return;
+
+ /* If there already is an rcuo kthread, then nothing to do. */
+ if (rdp->nocb_cb_kthread)
+ return;
+
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ sp.sched_priority = kthread_prio;
+ rdp_gp = rdp->nocb_gp_rdp;
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+ goto end;
+ }
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ if (kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ /* Spawn the kthread for this CPU. */
+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
+ goto end;
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+ return;
+end:
+ mutex_lock(&rcu_state.barrier_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ rcu_nocb_rdp_deoffload(rdp);
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ mutex_unlock(&rcu_state.barrier_mutex);
+}
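The priority handling above boils down to: create the kthread, then promote it to
SCHED_FIFO only when a non-zero kthread_prio was requested. A user-space analogue
using pthreads; kthread_prio here is a stand-in for the boot parameter of the same
name.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static int kthread_prio = 1;	/* illustrative; 0 means "leave the default policy" */

static void *worker(void *arg)
{
	return NULL;		/* stands in for rcu_nocb_cb_kthread() */
}

int main(void)
{
	pthread_t t;
	struct sched_param sp = { .sched_priority = kthread_prio };
	int ret;

	if (pthread_create(&t, NULL, worker, NULL))
		return 1;
	if (kthread_prio) {
		ret = pthread_setschedparam(t, SCHED_FIFO, &sp);
		if (ret)	/* typically requires CAP_SYS_NICE / root */
			fprintf(stderr, "SCHED_FIFO: %s\n", strerror(ret));
	}
	pthread_join(t, NULL);
	return 0;
}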
+
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
+
+/*
+ * Initialize GP-CB relationships for all no-CBs CPUs.
+ */
+static void __init rcu_organize_nocb_kthreads(void)
+{
+ int cpu;
+ bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
+
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+ if (ls == -1) {
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, then
+ * we will spawn the needed set of rcu_nocb_kthread() kthreads.
+ */
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu >= nl) {
+ /* New GP kthread, set up for CBs & next GP. */
+ gotnocbs = true;
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp_gp = rdp;
+ INIT_LIST_HEAD(&rdp->nocb_head_rdp);
+ if (dump_tree) {
+ if (!firsttime)
+ pr_cont("%s\n", gotnocbscbs
+ ? "" : " (self only)");
+ gotnocbscbs = false;
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:",
+ __func__, cpu);
+ }
+ } else {
+ /* Another CB kthread, link to previous GP kthread. */
+ gotnocbscbs = true;
+ if (dump_tree)
+ pr_cont(" %d", cpu);
+ }
+ rdp->nocb_gp_rdp = rdp_gp;
+ if (cpumask_test_cpu(cpu, rcu_nocb_mask))
+ list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp);
+ }
+ if (gotnocbs && dump_tree)
+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
+}
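A worked example of the grouping computed above: with 16 possible CPUs the default
stride is nr_cpu_ids / int_sqrt(nr_cpu_ids) = 16 / 4 = 4, so CPUs 0, 4, 8, and 12
each host an rcuog GP kthread and the following three CPUs attach to them as CB
CPUs. The stand-alone program below reproduces that mapping; int_sqrt() here is a
crude local stand-in for the kernel helper.

#include <stdio.h>

static int int_sqrt(int x)
{
	int r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	int nr_cpu_ids = 16;
	int ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);	/* stride: 4 */
	int nl = 0;					/* next GP kthread CPU */

	for (int cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (cpu >= nl) {
			nl = ((cpu + 1 + ls - 1) / ls) * ls;	/* DIV_ROUND_UP(cpu + 1, ls) * ls */
			printf("CPU %2d: new rcuog group\n", cpu);
		} else {
			printf("CPU %2d: CB CPU in group of CPU %d\n", cpu, (cpu / ls) * ls);
		}
	}
	return 0;
}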
+
+/*
+ * Bind the current task to the offloaded CPUs. If there are no offloaded
+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+ if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask))
+ WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
+/*
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
+ */
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
+{
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ char bufw[20];
+ char bufr[20];
+ struct rcu_data *nocb_next_rdp;
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp,
+ &rdp->nocb_entry_rdp,
+ typeof(*rdp),
+ nocb_entry_rdp);
+
+ sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ nocb_next_rdp ? nocb_next_rdp->cpu : -1,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
+ return; /* Nothing untoward. */
+
+ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
+}
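The state dumps above rely on the "kK"[!!cond] idiom: a two-character string literal
indexed by a boolean, yielding the lower-case letter when the condition is false and
the upper-case one when it is true. A tiny stand-alone demonstration:

#include <stdio.h>

int main(void)
{
	int has_kthread = 1;
	int timer_pending = 0;

	/* Prints "Kt": kthread present, no timer pending. */
	printf("%c%c\n", "kK"[!!has_kthread], "tT"[!!timer_pending]);
	return 0;
}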
+
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return 0;
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ return false;
+}
+
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+}
+
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+}
+
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ return false;
+}
+
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j, bool lazy)
+{
+ return true;
+}
+
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags, bool lazy)
+{
+ return false;
+}
+
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+}
+
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return false;
+}
+
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+}
+
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7e291ce0a1d6..41021080ad25 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -13,10 +13,29 @@
#include "../locking/rtmutex_common.h"
-#ifdef CONFIG_RCU_NOCB_CPU
-static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
+{
+ /*
+ * In order to read the offloaded state of an rdp in a safe
+ * and stable way and prevent from its value to be changed
+ * under us, we must either hold the barrier mutex, the cpu
+ * hotplug lock (read or write) or the nocb lock. Local
+ * non-preemptible reads are also safe. NOCB kthreads and
+ * timers have their own means of synchronization against the
+ * offloaded state updaters.
+ */
+ RCU_LOCKDEP_WARN(
+ !(lockdep_is_held(&rcu_state.barrier_mutex) ||
+ (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
+ rcu_lockdep_is_held_nocb(rdp) ||
+ (rdp == this_cpu_ptr(&rcu_data) &&
+ !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
+ rcu_current_is_nocb_kthread(rdp)),
+ "Unsafe read of RCU_NOCB offloaded state"
+ );
+
+ return rcu_segcblist_is_offloaded(&rdp->cblist);
+}
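Hypothetical callers (not part of this patch) satisfying two of the exclusions
listed above: a local non-preemptible read, and a cross-CPU read under the hotplug
read lock. The function names are made up for illustration.

static bool example_this_cpu_offloaded(void)
{
	bool ret;

	preempt_disable();	/* local non-preemptible read is allowed */
	ret = rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
	preempt_enable();
	return ret;
}

static bool example_cpu_offloaded(int cpu)
{
	bool ret;

	cpus_read_lock();	/* the hotplug lock also stabilizes the state */
	ret = rcu_rdp_is_offloaded(per_cpu_ptr(&rcu_data, cpu));
	cpus_read_unlock();
	return ret;
}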
/*
* Check the RCU kernel configuration parameters and print informative
@@ -32,12 +51,10 @@ static void __init rcu_bootup_announce_oddness(void)
RCU_FANOUT);
if (rcu_fanout_exact)
pr_info("\tHierarchical RCU autobalancing is disabled.\n");
- if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
if (IS_ENABLED(CONFIG_PROVE_RCU))
pr_info("\tRCU lockdep checking is enabled.\n");
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
- pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n");
+ pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n");
if (RCU_NUM_LVLS >= 4)
pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
@@ -69,13 +86,13 @@ static void __init rcu_bootup_announce_oddness(void)
if (rcu_kick_kthreads)
pr_info("\tKick kthreads if too-long grace period.\n");
if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
- pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+ pr_info("\tRCU callback double-/use-after-free debug is enabled.\n");
if (gp_preinit_delay)
pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
if (gp_init_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
- pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
if (!use_softirq)
pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
@@ -240,11 +257,13 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* GP should not be able to end until we report, so there should be
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
+ *
+ * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change.
*/
- if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
+ if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
else
- WARN_ON_ONCE(rdp->exp_deferred_qs);
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
}
/*
@@ -258,12 +277,16 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* current task, there might be any number of other tasks blocked while
* in an RCU read-side critical section.
*
+ * Unlike non-preemptible RCU, quiescent state reports for expedited
+ * grace periods are handled separately via deferred quiescent states
+ * and context switch events.
+ *
* Callers to this function must disable preemption.
*/
static void rcu_qs(void)
{
RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
- if (__this_cpu_read(rcu_data.cpu_no_qs.s)) {
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data.gp_seq),
TPS("cpuqs"));
@@ -294,7 +317,7 @@ void rcu_note_context_switch(bool preempt)
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
- WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
+ WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!");
if (rcu_preempt_depth() > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -309,7 +332,7 @@ void rcu_note_context_switch(bool preempt)
* then queue the task as required based on the states
* of any ongoing and expedited grace periods.
*/
- WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
+ WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
trace_rcu_preempt_task(rcu_state.name,
t->pid,
@@ -331,7 +354,7 @@ void rcu_note_context_switch(bool preempt)
* means that we continue to block the current grace period.
*/
rcu_qs();
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
@@ -353,17 +376,20 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
static void rcu_preempt_read_enter(void)
{
- current->rcu_read_lock_nesting++;
+ WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
}
static int rcu_preempt_read_exit(void)
{
- return --current->rcu_read_lock_nesting;
+ int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
+
+ WRITE_ONCE(current->rcu_read_lock_nesting, ret);
+ return ret;
}
static void rcu_preempt_depth_set(int val)
{
- current->rcu_read_lock_nesting = val;
+ WRITE_ONCE(current->rcu_read_lock_nesting, val);
}
/*
@@ -393,8 +419,9 @@ void __rcu_read_unlock(void)
{
struct task_struct *t = current;
+ barrier(); // critical section before exit code.
if (rcu_preempt_read_exit() == 0) {
- barrier(); /* critical section before exit code. */
+ barrier(); // critical-section exit before .s check.
if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
rcu_read_unlock_special(t);
}
@@ -435,7 +462,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* be quite short, for example, in the case of the call from
* rcu_read_unlock_special().
*/
-static void
+static notrace void
rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
bool empty_exp;
@@ -454,13 +481,14 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
special = t->rcu_read_unlock_special;
rdp = this_cpu_ptr(&rcu_data);
- if (!special.s && !rdp->exp_deferred_qs) {
+ if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
}
t->rcu_read_unlock_special.s = 0;
if (special.b.need_qs) {
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
+ rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
udelay(rcu_unlock_delay);
} else {
@@ -474,7 +502,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* tasks are handled when removing the task from the
* blocked-tasks list below.
*/
- if (rdp->exp_deferred_qs)
+ if (rdp->cpu_no_qs.b.exp)
rcu_report_exp_rdp(rdp);
/* Clean up if blocked during RCU read-side critical section. */
@@ -506,7 +534,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
WRITE_ONCE(rnp->exp_tasks, np);
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
- drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t;
if (&t->rcu_node_entry == rnp->boost_tasks)
WRITE_ONCE(rnp->boost_tasks, np);
}
@@ -531,16 +559,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
- /* Unboost if we were boosted. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_futex_unlock(&rnp->boost_mtx);
-
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
rcu_report_exp_rnp(rnp, true);
+
+ /* Unboost if we were boosted. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
+ rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
} else {
local_irq_restore(flags);
}
@@ -555,9 +583,9 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* is disabled. This function cannot be expected to understand these
* nuances, so the caller must handle them.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
- return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
+ return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
rcu_preempt_depth() == 0;
}
@@ -569,7 +597,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
* evaluate safety in terms of interrupt, softirq, and preemption
* disabling.
*/
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
{
unsigned long flags;
@@ -598,9 +626,9 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
static void rcu_read_unlock_special(struct task_struct *t)
{
unsigned long flags;
+ bool irqs_were_disabled;
bool preempt_bh_were_disabled =
!!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
- bool irqs_were_disabled;
/* NMI handlers cannot block and cannot safely manipulate state. */
if (in_nmi())
@@ -609,30 +637,40 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
irqs_were_disabled = irqs_disabled_flags(flags);
if (preempt_bh_were_disabled || irqs_were_disabled) {
- bool exp;
+ bool expboost; // Expedited GP in flight or possible boosting.
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- exp = (t->rcu_blocked_node &&
- READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
- (rdp->grpmask & READ_ONCE(rnp->expmask));
+ expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
+ (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
+ (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
+ (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
+ t->rcu_blocked_node);
// Need to defer quiescent state until everything is enabled.
- if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) {
+ if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
// Using softirq, safe to awaken, and either the
- // wakeup is free or there is an expedited GP.
+ // wakeup is free or there is either an expedited
+ // GP in flight or a potential need to deboost.
raise_softirq_irqoff(RCU_SOFTIRQ);
} else {
// Enabling BH or preempt does reschedule, so...
- // Also if no expediting, slow is OK.
- // Plus nohz_full CPUs eventually get tick enabled.
+ // Also if no expediting and no possible deboosting,
+ // slow is OK. Plus nohz_full CPUs eventually get
+ // tick enabled.
set_tsk_need_resched(current);
set_preempt_need_resched();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
- !rdp->defer_qs_iw_pending && exp && cpu_online(rdp->cpu)) {
+ expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- init_irq_work(&rdp->defer_qs_iw,
- rcu_preempt_deferred_qs_handler);
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ IS_ENABLED(CONFIG_PREEMPT_RT))
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
+ rcu_preempt_deferred_qs_handler);
+ else
+ init_irq_work(&rdp->defer_qs_iw,
+ rcu_preempt_deferred_qs_handler);
rdp->defer_qs_iw_pending = true;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
@@ -682,9 +720,7 @@ static void rcu_flavor_sched_clock_irq(int user)
{
struct task_struct *t = current;
- if (user || rcu_is_cpu_rrupt_from_idle()) {
- rcu_note_voluntary_context_switch(current);
- }
+ lockdep_assert_irqs_disabled();
if (rcu_preempt_depth() > 0 ||
(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
/* No QS, force context switch if deferred. */
@@ -744,7 +780,6 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
int cpu;
int i;
struct list_head *lhp;
- bool onl;
struct rcu_data *rdp;
struct rcu_node *rnp1;
@@ -768,9 +803,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_cont("\n");
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
- cpu, ".o"[onl],
+ cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
}
@@ -787,10 +821,10 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
- irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
return;
rdp = this_cpu_ptr(&rcu_data);
+ rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
udelay(rcu_unlock_delay);
}
@@ -819,10 +853,8 @@ static void rcu_qs(void)
trace_rcu_grace_period(TPS("rcu_sched"),
__this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
- return;
- __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
+ if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
}
/*
@@ -838,7 +870,7 @@ void rcu_all_qs(void)
if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
return;
- preempt_disable();
+ preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels
/* Load rcu_urgent_qs before other flags. */
if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
preempt_enable();
@@ -868,8 +900,8 @@ void rcu_note_context_switch(bool preempt)
this_cpu_write(rcu_data.rcu_urgent_qs, false);
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
rcu_momentary_dyntick_idle();
- rcu_tasks_qs(current, preempt);
out:
+ rcu_tasks_qs(current, preempt);
trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -895,11 +927,25 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
* Because there is no preemptible RCU, there can be no deferred quiescent
* states.
*/
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
return false;
}
-static void rcu_preempt_deferred_qs(struct task_struct *t) { }
+
+// Except that we do need to respond to a request by an expedited
+// grace period for a quiescent state from this CPU. Note that in
+// non-preemptible kernels, there can be no context switches within RCU
+// read-side critical sections, which in turn means that the leaf rcu_node
+// structure's blocked-tasks list is always empty. There is therefore no need to
+// actually check it. Instead, a quiescent state from this CPU suffices,
+// and this function is only called from such a quiescent state.
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
+{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
+ if (READ_ONCE(rdp->cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(rdp);
+}
/*
* Because there is no preemptible RCU, there can be no readers blocked,
@@ -930,7 +976,6 @@ static void rcu_flavor_sched_clock_irq(int user)
* neither access nor modify, at least not while the
* corresponding CPU is online.
*/
-
rcu_qs();
}
}
@@ -959,12 +1004,34 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
*/
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
#ifdef CONFIG_RCU_BOOST
struct sched_param sp;
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ WRITE_ONCE(rdp->rcuc_activity, jiffies);
+}
+
+static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return rdp->nocb_cb_kthread == current;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
+{
+ return rdp->rcu_cpu_kthread_task == current ||
+ rcu_is_callbacks_nocb_kthread(rdp);
}
#ifdef CONFIG_RCU_BOOST
@@ -1026,11 +1093,12 @@ static int rcu_boost(struct rcu_node *rnp)
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+ rnp->n_boosts++;
return READ_ONCE(rnp->exp_tasks) != NULL ||
READ_ONCE(rnp->boost_tasks) != NULL;
@@ -1085,7 +1153,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
raw_lockdep_assert_held_rcu_node(rnp);
- if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
+ if (!rnp->boost_kthread_task ||
+ (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
@@ -1093,7 +1162,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
(rnp->gp_tasks != NULL &&
rnp->boost_tasks == NULL &&
rnp->qsmask == 0 &&
- (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
+ (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
+ IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
if (rnp->exp_tasks == NULL)
WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1104,15 +1174,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
}
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
- return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
-}
-
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@@ -1126,30 +1187,22 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
/*
* Create an RCU-boost kthread for the specified node if one does not
* already exist. We only create this kthread for preemptible RCU.
- * Returns zero if all is well, a negated errno otherwise.
*/
static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
- int rnp_index = rnp - rcu_get_root();
unsigned long flags;
+ int rnp_index = rnp - rcu_get_root();
struct sched_param sp;
struct task_struct *t;
- if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
- return;
-
- if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
- return;
-
- rcu_state.boost = 1;
-
- if (rnp->boost_kthread_task != NULL)
- return;
+ mutex_lock(&rnp->boost_kthread_mutex);
+ if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
+ goto out;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
- return;
+ goto out;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
@@ -1157,6 +1210,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+
+ out:
+ mutex_unlock(&rnp->boost_kthread_mutex);
}
/*
@@ -1167,11 +1223,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
* We don't include outgoingcpu in the affinity set, use -1 if there is
* no outgoing CPU. If there are no CPUs left in the affinity set,
* this function allows the kthread to execute on any CPU.
+ *
+ * Any future concurrent calls are serialized via ->boost_kthread_mutex.
*/
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
struct task_struct *t = rnp->boost_kthread_task;
- unsigned long mask = rcu_rnp_online_cpus(rnp);
+ unsigned long mask;
cpumask_var_t cm;
int cpu;
@@ -1179,37 +1237,23 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
return;
if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
+ mutex_lock(&rnp->boost_kthread_mutex);
+ mask = rcu_rnp_online_cpus(rnp);
for_each_leaf_node_possible_cpu(rnp, cpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- if (cpumask_weight(cm) == 0)
- cpumask_setall(cm);
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (cpumask_empty(cm)) {
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (outgoingcpu >= 0)
+ cpumask_clear_cpu(outgoingcpu, cm);
+ }
set_cpus_allowed_ptr(t, cm);
+ mutex_unlock(&rnp->boost_kthread_mutex);
free_cpumask_var(cm);
}
-/*
- * Spawn boost kthreads -- called as soon as the scheduler is running.
- */
-static void __init rcu_spawn_boost_kthreads(void)
-{
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(rnp)
- rcu_spawn_one_boost_kthread(rnp);
-}
-
-static void rcu_prepare_kthreads(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp = rdp->mynode;
-
- /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_boost_kthread(rnp);
-}
-
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1218,1324 +1262,20 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static bool rcu_is_callbacks_kthread(void)
-{
- return false;
-}
-
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
-}
-
-static void __init rcu_spawn_boost_kthreads(void)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static void rcu_prepare_kthreads(int cpu)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
#endif /* #else #ifdef CONFIG_RCU_BOOST */
-#if !defined(CONFIG_RCU_FAST_NO_HZ)
-
-/*
- * Check to see if any future non-offloaded RCU-related work will need
- * to be done by the current CPU, even if none need be done immediately,
- * returning 1 if so. This function is part of the RCU implementation;
- * it is -not- an exported member of the RCU API.
- *
- * Because we not have RCU_FAST_NO_HZ, just check whether or not this
- * CPU has RCU callbacks queued.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- *nextevt = KTIME_MAX;
- return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
- !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist);
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
- * after it.
- */
-static void rcu_cleanup_after_idle(void)
-{
-}
-
-/*
- * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
- * is nothing.
- */
-static void rcu_prepare_for_idle(void)
-{
-}
-
-#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-/*
- * This code is invoked when a CPU goes idle, at which point we want
- * to have the CPU do everything required for RCU so that it can enter
- * the energy-efficient dyntick-idle mode.
- *
- * The following preprocessor symbol controls this:
- *
- * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
- * to sleep in dyntick-idle mode with RCU callbacks pending. This
- * is sized to be roughly one RCU grace period. Those energy-efficiency
- * benchmarkers who might otherwise be tempted to set this to a large
- * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
- * system. And if you are -that- concerned about energy efficiency,
- * just power the system down and be done with it!
- *
- * The value below works well in practice. If future workloads require
- * adjustment, they can be converted into kernel config parameters, though
- * making the state machine smarter might be a better option.
- */
-#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
-
-static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
-module_param(rcu_idle_gp_delay, int, 0644);
-
-/*
- * Try to advance callbacks on the current CPU, but only if it has been
- * awhile since the last time we did so. Afterwards, if there are any
- * callbacks ready for immediate invocation, return true.
- */
-static bool __maybe_unused rcu_try_advance_all_cbs(void)
-{
- bool cbs_ready = false;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
-
- /* Exit early if we advanced recently. */
- if (jiffies == rdp->last_advance_all)
- return false;
- rdp->last_advance_all = jiffies;
-
- rnp = rdp->mynode;
-
- /*
- * Don't bother checking unless a grace period has
- * completed since we last checked and there are
- * callbacks not yet ready to invoke.
- */
- if ((rcu_seq_completed_gp(rdp->gp_seq,
- rcu_seq_current(&rnp->gp_seq)) ||
- unlikely(READ_ONCE(rdp->gpwrap))) &&
- rcu_segcblist_pend_cbs(&rdp->cblist))
- note_gp_changes(rdp);
-
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
- cbs_ready = true;
- return cbs_ready;
-}
-
-/*
- * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
- * to invoke. If the CPU has callbacks, try to advance them. Tell the
- * caller about what to set the timeout.
- *
- * The caller must have disabled interrupts.
- */
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- unsigned long dj;
-
- lockdep_assert_irqs_disabled();
-
- /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
- if (rcu_segcblist_empty(&rdp->cblist) ||
- rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) {
- *nextevt = KTIME_MAX;
- return 0;
- }
-
- /* Attempt to advance callbacks. */
- if (rcu_try_advance_all_cbs()) {
- /* Some ready to invoke, so initiate later invocation. */
- invoke_rcu_core();
- return 1;
- }
- rdp->last_accelerate = jiffies;
-
- /* Request timer and round. */
- dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
-
- *nextevt = basemono + dj * TICK_NSEC;
- return 0;
-}
-
-/*
- * Prepare a CPU for idle from an RCU perspective. The first major task is to
- * sense whether nohz mode has been enabled or disabled via sysfs. The second
- * major task is to accelerate (that is, assign grace-period numbers to) any
- * recently arrived callbacks.
- *
- * The caller must have disabled interrupts.
- */
-static void rcu_prepare_for_idle(void)
-{
- bool needwake;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp;
- int tne;
-
- lockdep_assert_irqs_disabled();
- if (rcu_segcblist_is_offloaded(&rdp->cblist))
- return;
-
- /* Handle nohz enablement switches conservatively. */
- tne = READ_ONCE(tick_nohz_active);
- if (tne != rdp->tick_nohz_enabled_snap) {
- if (!rcu_segcblist_empty(&rdp->cblist))
- invoke_rcu_core(); /* force nohz to see update. */
- rdp->tick_nohz_enabled_snap = tne;
- return;
- }
- if (!tne)
- return;
-
- /*
- * If we have not yet accelerated this jiffy, accelerate all
- * callbacks on this CPU.
- */
- if (rdp->last_accelerate == jiffies)
- return;
- rdp->last_accelerate = jiffies;
- if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
- rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- needwake = rcu_accelerate_cbs(rnp, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- if (needwake)
- rcu_gp_kthread_wake();
- }
-}
-
-/*
- * Clean up for exit from idle. Attempt to advance callbacks based on
- * any grace periods that elapsed while the CPU was idle, and if any
- * callbacks are now ready to invoke, initiate invocation.
- */
-static void rcu_cleanup_after_idle(void)
-{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- lockdep_assert_irqs_disabled();
- if (rcu_segcblist_is_offloaded(&rdp->cblist))
- return;
- if (rcu_try_advance_all_cbs())
- invoke_rcu_core();
-}
-
-#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-
-#ifdef CONFIG_RCU_NOCB_CPU
-
-/*
- * Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
- * created that pull the callbacks from the corresponding CPU, wait for
- * a grace period to elapse, and invoke the callbacks. These kthreads
- * are organized into GP kthreads, which manage incoming callbacks, wait for
- * grace periods, and awaken CB kthreads, and the CB kthreads, which only
- * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
- * do a wake_up() on their GP kthread when they insert a callback into any
- * empty list, unless the rcu_nocb_poll boot parameter has been specified,
- * in which case each kthread actively polls its CPU. (Which isn't so great
- * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
- *
- * This is intended to be used in conjunction with Frederic Weisbecker's
- * adaptive-idle work, which would seriously reduce OS jitter on CPUs
- * running CPU-bound user-mode computations.
- *
- * Offloading of callbacks can also be used as an energy-efficiency
- * measure because CPUs with no RCU callbacks queued are more aggressive
- * about entering dyntick-idle mode.
- */
-
-
-/*
- * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
- * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
- * comma-separated list of CPUs and/or CPU ranges. If an invalid list is
- * given, a warning is emitted and all CPUs are offloaded.
- */
-static int __init rcu_nocb_setup(char *str)
-{
- alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- if (!strcasecmp(str, "all"))
- cpumask_setall(rcu_nocb_mask);
- else
- if (cpulist_parse(str, rcu_nocb_mask)) {
- pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
- cpumask_setall(rcu_nocb_mask);
- }
- return 1;
-}
-__setup("rcu_nocbs=", rcu_nocb_setup);
-
-static int __init parse_rcu_nocb_poll(char *arg)
-{
- rcu_nocb_poll = true;
- return 0;
-}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
-
-/*
- * Don't bother bypassing ->cblist if the call_rcu() rate is low.
- * After all, the main point of bypassing is to avoid lock contention
- * on ->nocb_lock, which only can happen at high call_rcu() rates.
- */
-int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
-module_param(nocb_nobypass_lim_per_jiffy, int, 0);
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
- */
-static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
- __acquires(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- if (raw_spin_trylock(&rdp->nocb_bypass_lock))
- return;
- atomic_inc(&rdp->nocb_lock_contended);
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
- raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
-}
-
-/*
- * Conditionally acquire the specified rcu_data structure's
- * ->nocb_bypass_lock.
- */
-static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- return raw_spin_trylock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_bypass_lock.
- */
-static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
- __releases(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (!rcu_segcblist_is_offloaded(&rdp->cblist))
- return;
- raw_spin_lock(&rdp->nocb_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
- if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_lock);
- }
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock and restore
- * interrupts, but only if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- } else {
- local_irq_restore(flags);
- }
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (rcu_segcblist_is_offloaded(&rdp->cblist))
- lockdep_assert_held(&rdp->nocb_lock);
-}
-
-/*
- * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
- * grace period.
- */
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
- swake_up_all(sq);
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
- init_swait_queue_head(&rnp->nocb_gp_wq[0]);
- init_swait_queue_head(&rnp->nocb_gp_wq[1]);
-}
-
-/* Is the specified CPU a no-CBs CPU? */
-bool rcu_is_nocb_cpu(int cpu)
-{
- if (cpumask_available(rcu_nocb_mask))
- return cpumask_test_cpu(cpu, rcu_nocb_mask);
- return false;
-}
-
-/*
- * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
- * and this function releases it.
- */
-static void wake_nocb_gp(struct rcu_data *rdp, bool force,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- bool needwake = false;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- lockdep_assert_held(&rdp->nocb_lock);
- if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("AlreadyAwake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
- }
- del_timer(&rdp->nocb_timer);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
- WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
- needwake = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (needwake)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-}
-
-/*
- * Arrange to wake the GP kthread for this NOCB group at some future
- * time when it is safe to do so.
- */
-static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- const char *reason)
-{
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
- mod_timer(&rdp->nocb_timer, jiffies + 1);
- if (rdp->nocb_defer_wakeup < waketype)
- WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
-}
-
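For clarity, here is a small self-contained sketch (invented names; printf() stands in for arming ->nocb_timer and for the eventual wake_up_process()) of the deferred-wakeup bookkeeping that wake_nocb_gp_defer() implements together with do_nocb_deferred_wakeup_common() further below: remember the strongest pending request, arm the timer only on the first deferral, and clear the request once the wakeup actually happens.

    #include <stdio.h>

    /* Mirrors RCU_NOCB_WAKE_NOT / RCU_NOCB_WAKE / RCU_NOCB_WAKE_FORCE. */
    enum wake_type { WAKE_NOT, WAKE, WAKE_FORCE };

    static enum wake_type deferred;

    /* Record the strongest request; arm a timer only when leaving WAKE_NOT. */
    static void defer_wake(enum wake_type t)
    {
        if (deferred == WAKE_NOT)
            printf("arm timer for the next tick\n");
        if (deferred < t)
            deferred = t;
    }

    /* Timer handler or fastpath: consume the request and do the wakeup. */
    static void do_deferred_wake(void)
    {
        enum wake_type t = deferred;

        if (t == WAKE_NOT)
            return;
        deferred = WAKE_NOT;
        printf("wake GP kthread%s\n", t == WAKE_FORCE ? " (force)" : "");
    }

    int main(void)
    {
        defer_wake(WAKE);
        defer_wake(WAKE_FORCE);    /* Upgrades the already-pending request. */
        do_deferred_wake();
        return 0;
    }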
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- struct rcu_cblist rcl;
-
- WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist));
- rcu_lockdep_assert_cblist_protected(rdp);
- lockdep_assert_held(&rdp->nocb_bypass_lock);
- if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
- raw_spin_unlock(&rdp->nocb_bypass_lock);
- return false;
- }
- /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
- if (rhp)
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- rcu_nocb_bypass_unlock(rdp);
- return true;
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- if (!rcu_segcblist_is_offloaded(&rdp->cblist))
- return true;
- rcu_lockdep_assert_cblist_protected(rdp);
- rcu_nocb_bypass_lock(rdp);
- return rcu_nocb_do_flush_bypass(rdp, rhp, j);
-}
-
-/*
- * If the ->nocb_bypass_lock is immediately available, flush the
- * ->nocb_bypass queue into ->cblist.
- */
-static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
-{
- rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_segcblist_is_offloaded(&rdp->cblist) ||
- !rcu_nocb_bypass_trylock(rdp))
- return;
- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
-}
-
-/*
- * See whether it is appropriate to use the ->nocb_bypass list in order
- * to control contention on ->nocb_lock. A limited number of direct
- * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
- * is non-empty, further callbacks must be placed into ->nocb_bypass,
- * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
- * back to direct use of ->cblist. However, ->nocb_bypass should not be
- * used if ->cblist is empty, because otherwise callbacks can be stranded
- * on ->nocb_bypass because we cannot count on the current CPU ever again
- * invoking call_rcu(). The general rule is that if ->nocb_bypass is
- * non-empty, the corresponding no-CBs grace-period kthread must not be
- * in an indefinite sleep state.
- *
- * Finally, it is not permitted to use the bypass during early boot,
- * as doing so would confuse the auto-initialization code. Besides
- * which, there is no point in worrying about lock contention while
- * there is only one CPU in operation.
- */
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- unsigned long c;
- unsigned long cur_gp_seq;
- unsigned long j = jiffies;
- long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-
- if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false; /* Not offloaded, no bypassing. */
- }
- lockdep_assert_irqs_disabled();
-
- // Don't use ->nocb_bypass during early boot.
- if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
- rcu_nocb_lock(rdp);
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // If we have advanced to a new jiffy, reset counts to allow
- // moving back from ->nocb_bypass to ->cblist.
- if (j == rdp->nocb_nobypass_last) {
- c = rdp->nocb_nobypass_count + 1;
- } else {
- WRITE_ONCE(rdp->nocb_nobypass_last, j);
- c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
- if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
- nocb_nobypass_lim_per_jiffy))
- c = 0;
- else if (c > nocb_nobypass_lim_per_jiffy)
- c = nocb_nobypass_lim_per_jiffy;
- }
- WRITE_ONCE(rdp->nocb_nobypass_count, c);
-
- // If there hasn't yet been all that many ->cblist enqueues
- // this jiffy, tell the caller to enqueue onto ->cblist. But flush
- // ->nocb_bypass first.
- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
-
- // If ->nocb_bypass has been used too long or is too full,
- // flush ->nocb_bypass to ->cblist.
- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return true; // Callback already enqueued.
- }
-
- // We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
- rcu_nocb_bypass_lock(rdp);
- ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
- if (!ncbs) {
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
- }
- rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
- if (ncbs) {
- local_irq_restore(flags);
- } else {
- // No-CBs GP kthread might be indefinitely asleep, if so, wake.
- rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
- if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQwake"));
- __call_rcu_nocb_wake(rdp, true, flags);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQnoWake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- }
- return true; // Callback already enqueued.
-}
-
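As a rough, self-contained illustration of the per-jiffy rate limit described above (an illustrative per-tick limit and a plain time_t tick in place of jiffies; the real code additionally tracks *was_alldone and the flush-to-->cblist cases shown in the function):

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    #define NOBYPASS_LIM_PER_TICK 16    /* Illustrative; kernel default is 16 * 1000 / HZ. */

    struct rate_state {
        time_t last_tick;
        long count;
    };

    /* True when the caller should divert to the bypass list this tick. */
    static bool should_bypass(struct rate_state *rs, time_t now)
    {
        if (now == rs->last_tick) {
            rs->count++;
        } else {
            /* New tick: decay the count so direct enqueueing can resume. */
            rs->last_tick = now;
            if (rs->count < NOBYPASS_LIM_PER_TICK)
                rs->count = 0;
            else
                rs->count -= NOBYPASS_LIM_PER_TICK;
            if (rs->count > NOBYPASS_LIM_PER_TICK)
                rs->count = NOBYPASS_LIM_PER_TICK;
        }
        return rs->count >= NOBYPASS_LIM_PER_TICK;
    }

    int main(void)
    {
        struct rate_state rs = { 0, 0 };

        for (int i = 0; i < 20; i++)
            printf("enqueue %2d: %s\n", i,
                   should_bypass(&rs, (time_t)1) ? "bypass" : "direct");
        return 0;
    }

The first 16 enqueues within a tick go straight to ->cblist; everything after that is diverted until a new tick decays the count.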
-/*
- * Awaken the no-CBs grace-period kthread if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPU's queues.
- */
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- unsigned long cur_gp_seq;
- unsigned long j;
- long len;
- struct task_struct *t;
-
- // If we are being polled or there is no kthread, just leave.
- t = READ_ONCE(rdp->nocb_gp_kthread);
- if (rcu_nocb_poll || !t) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeNotPoll"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
- }
-	// Need to actually do a wakeup.
- len = rcu_segcblist_n_cbs(&rdp->cblist);
- if (was_alldone) {
- rdp->qlen_last_fqs_check = len;
- if (!irqs_disabled_flags(flags)) {
- /* ... if queue was empty ... */
- wake_nocb_gp(rdp, false, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeEmpty"));
- } else {
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
- TPS("WakeEmptyIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- } else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
- rdp->qlen_last_fqs_check = len;
- j = jiffies;
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- smp_mb(); /* Enqueue before timer_pending(). */
- if ((rdp->nocb_cb_sleep ||
- !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_bypass_timer))
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- return;
-}
-
-/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
-static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
-{
- unsigned long flags;
- struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
-
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
- rcu_nocb_lock_irqsave(rdp, flags);
- smp_mb__after_spinlock(); /* Timer expire before wakeup. */
- __call_rcu_nocb_wake(rdp, true, flags);
-}
-
-/*
- * No-CBs GP kthreads come here to wait for additional callbacks to show up
- * or for grace periods to end.
- */
-static void nocb_gp_wait(struct rcu_data *my_rdp)
-{
- bool bypass = false;
- long bypass_ncbs;
- int __maybe_unused cpu = my_rdp->cpu;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool gotcbs = false;
- unsigned long j = jiffies;
- bool needwait_gp = false; // This prevents actual uninitialized use.
- bool needwake;
- bool needwake_gp;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
- bool wasempty = false;
-
- /*
- * Each pass through the following loop checks for CBs and for the
- * nearest grace period (if any) to wait for next. The CB kthreads
- * and the global grace-period kthread are awakened if needed.
- */
- WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
- rcu_nocb_lock_irqsave(rdp, flags);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- if (bypass_ncbs &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
- // Bypass full or old, so flush it.
- (void)rcu_nocb_try_flush_bypass(rdp, j);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- continue; /* No callbacks here, try next. */
- }
- if (bypass_ncbs) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("Bypass"));
- bypass = true;
- }
- rnp = rdp->mynode;
- if (bypass) { // Avoid race with first bypass CB.
- WRITE_ONCE(my_rdp->nocb_defer_wakeup,
- RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- }
- // Advance callbacks if helpful and low contention.
- needwake_gp = false;
- if (!rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL) ||
- (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
- raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
- needwake_gp = rcu_advance_cbs(rnp, rdp);
- wasempty = rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL);
- raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
- }
- // Need to wait on some grace period?
- WARN_ON_ONCE(wasempty &&
- !rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL));
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
- if (!needwait_gp ||
- ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
- wait_gp_seq = cur_gp_seq;
- needwait_gp = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("NeedWaitGP"));
- }
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- needwake = rdp->nocb_cb_sleep;
- WRITE_ONCE(rdp->nocb_cb_sleep, false);
- smp_mb(); /* CB invocation -after- GP end. */
- } else {
- needwake = false;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake) {
- swake_up_one(&rdp->nocb_cb_wq);
- gotcbs = true;
- }
- if (needwake_gp)
- rcu_gp_kthread_wake();
- }
-
- my_rdp->nocb_gp_bypass = bypass;
- my_rdp->nocb_gp_gp = needwait_gp;
- my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
- if (bypass && !rcu_nocb_poll) {
- // At least one child with non-empty ->nocb_bypass, so set
- // timer in order to avoid stranding its callbacks.
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- if (rcu_nocb_poll) {
- /* Polling, so trace if first poll in the series. */
- if (gotcbs)
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_idle(1);
- } else if (!needwait_gp) {
- /* Wait for callbacks to appear. */
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
- } else {
- rnp = my_rdp->mynode;
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
- swait_event_interruptible_exclusive(
- rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
- rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
- }
- if (!rcu_nocb_poll) {
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- if (bypass)
- del_timer(&my_rdp->nocb_bypass_timer);
- WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- my_rdp->nocb_gp_seq = -1;
- WARN_ON(signal_pending(current));
-}
-
-/*
- * No-CBs grace-period-wait kthread. There is one of these per group
- * of CPUs, but only after at least one CPU in that group has come online
- * at least once since boot. This kthread checks for newly posted
- * callbacks from any of the CPUs it is responsible for, waits for a
- * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
- * that then have callback-invocation work to do.
- */
-static int rcu_nocb_gp_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- for (;;) {
- WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
- nocb_gp_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-/*
- * Invoke any ready callbacks from the corresponding no-CBs CPU,
- * then, if there are no more, wait for more to appear.
- */
-static void nocb_cb_wait(struct rcu_data *rdp)
-{
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool needwake_gp = false;
- struct rcu_node *rnp = rdp->mynode;
-
- local_irq_save(flags);
- rcu_momentary_dyntick_idle();
- local_irq_restore(flags);
- local_bh_disable();
- rcu_do_batch(rdp);
- local_bh_enable();
- lockdep_assert_irqs_enabled();
- rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
- raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
- needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
- return;
- }
-
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
- WRITE_ONCE(rdp->nocb_cb_sleep, true);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- !READ_ONCE(rdp->nocb_cb_sleep));
- if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
- /* ^^^ Ensure CB invocation follows _sleep test. */
- return;
- }
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
-}
-
-/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
- * nocb_cb_wait() to do the dirty work.
- */
-static int rcu_nocb_cb_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- // Each pass through this loop does one callback batch, and,
- // if there are no more ready callbacks, waits for them.
- for (;;) {
- nocb_cb_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-/* Is a deferred wakeup of rcu_nocb_kthread() required? */
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return READ_ONCE(rdp->nocb_defer_wakeup);
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
-{
- unsigned long flags;
- int ndw;
-
- rcu_nocb_lock_irqsave(rdp, flags);
- if (!rcu_nocb_need_deferred_wakeup(rdp)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return;
- }
- ndw = READ_ONCE(rdp->nocb_defer_wakeup);
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
-static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
-{
- struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
-
- do_nocb_deferred_wakeup_common(rdp);
-}
-
-/*
- * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
- * This means we do an inexact common-case check. Note that if
- * we miss, ->nocb_timer will eventually clean things up.
- */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- if (rcu_nocb_need_deferred_wakeup(rdp))
- do_nocb_deferred_wakeup_common(rdp);
-}
-
-void __init rcu_init_nohz(void)
-{
- int cpu;
- bool need_rcu_nocb_mask = false;
- struct rcu_data *rdp;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
- need_rcu_nocb_mask = true;
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
- return;
- }
- }
- if (!cpumask_available(rcu_nocb_mask))
- return;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running)
- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
- if (cpumask_empty(rcu_nocb_mask))
- pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
- else
- pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
- cpumask_pr_args(rcu_nocb_mask));
- if (rcu_nocb_poll)
- pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
-
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_segcblist_empty(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist);
- }
- rcu_organize_nocb_kthreads();
-}
-
-/* Initialize per-rcu_data variables for no-CBs CPUs. */
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
- init_swait_queue_head(&rdp->nocb_cb_wq);
- init_swait_queue_head(&rdp->nocb_gp_wq);
- raw_spin_lock_init(&rdp->nocb_lock);
- raw_spin_lock_init(&rdp->nocb_bypass_lock);
- raw_spin_lock_init(&rdp->nocb_gp_lock);
- timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
- timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
- rcu_cblist_init(&rdp->nocb_bypass);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
- * for this CPU's group has not yet been created, spawn it as well.
- */
-static void rcu_spawn_one_nocb_kthread(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_data *rdp_gp;
- struct task_struct *t;
-
- /*
- * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
- * then nothing to do.
- */
- if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
- return;
-
- /* If we didn't spawn the GP kthread first, reorganize! */
- rdp_gp = rdp->nocb_gp_rdp;
- if (!rdp_gp->nocb_gp_kthread) {
- t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
- "rcuog/%d", rdp_gp->cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
- }
-
- /* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp->nocb_cb_kthread, t);
- WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it.
- */
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_nocb_kthread(cpu);
-}
-
-/*
- * Once the scheduler is running, spawn rcuo kthreads for all online
- * no-CBs CPUs. This assumes that the early_initcall()s happen before
- * non-boot CPUs come online -- if this changes, we will need to add
- * some mutual exclusion.
- */
-static void __init rcu_spawn_nocb_kthreads(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- rcu_spawn_cpu_nocb_kthread(cpu);
-}
-
-/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_gp_stride = -1;
-module_param(rcu_nocb_gp_stride, int, 0444);
-
-/*
- * Initialize GP-CB relationships for all no-CBs CPUs.
- */
-static void __init rcu_organize_nocb_kthreads(void)
-{
- int cpu;
- bool firsttime = true;
- bool gotnocbs = false;
- bool gotnocbscbs = true;
- int ls = rcu_nocb_gp_stride;
- int nl = 0; /* Next GP kthread. */
- struct rcu_data *rdp;
- struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
- struct rcu_data *rdp_prev = NULL;
-
- if (!cpumask_available(rcu_nocb_mask))
- return;
- if (ls == -1) {
- ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
- rcu_nocb_gp_stride = ls;
- }
-
- /*
- * Each pass through this loop sets up one rcu_data structure.
- * Should the corresponding CPU come online in the future, then
- * we will spawn the needed set of rcu_nocb_kthread() kthreads.
- */
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->cpu >= nl) {
- /* New GP kthread, set up for CBs & next GP. */
- gotnocbs = true;
- nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_gp_rdp = rdp;
- rdp_gp = rdp;
- if (dump_tree) {
- if (!firsttime)
- pr_cont("%s\n", gotnocbscbs
- ? "" : " (self only)");
- gotnocbscbs = false;
- firsttime = false;
- pr_alert("%s: No-CB GP kthread CPU %d:",
- __func__, cpu);
- }
- } else {
- /* Another CB kthread, link to previous GP kthread. */
- gotnocbscbs = true;
- rdp->nocb_gp_rdp = rdp_gp;
- rdp_prev->nocb_next_cb_rdp = rdp;
- if (dump_tree)
- pr_cont(" %d", cpu);
- }
- rdp_prev = rdp;
- }
- if (gotnocbs && dump_tree)
- pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
-}
-
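To make the leader/follower assignment concrete, here is a standalone sketch assuming a hypothetical 16-CPU system with every CPU in rcu_nocb_mask and the default stride (sqrt() stands in for the kernel's int_sqrt()):

    #include <math.h>
    #include <stdio.h>

    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        int nr_cpu_ids = 16;                            /* Illustrative CPU count. */
        int ls = nr_cpu_ids / (int)sqrt(nr_cpu_ids);    /* Default stride: 16 / 4 = 4. */
        int nl = 0;                                     /* Next group boundary. */

        for (int cpu = 0; cpu < nr_cpu_ids; cpu++) {
            if (cpu >= nl) {
                nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
                printf("\nrcuog leader CPU %d, CB CPUs:", cpu);
            }
            printf(" %d", cpu);
        }
        printf("\n");
        return 0;
    }

With these numbers the loop produces four groups, CPUs 0-3, 4-7, 8-11, and 12-15, each served by the rcuog kthread of its lowest-numbered CPU (build with -lm).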
-/*
- * Bind the current task to the offloaded CPUs. If there are no offloaded
- * CPUs, leave the task unbound. Splat if the bind attempt fails.
- */
-void rcu_bind_current_to_nocb(void)
-{
- if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
- WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
-}
-EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
-
-/*
- * Dump out nocb grace-period kthread state for the specified rcu_data
- * structure.
- */
-static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
-{
- struct rcu_node *rnp = rdp->mynode;
-
- pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
- rdp->cpu,
- "kK"[!!rdp->nocb_gp_kthread],
- "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[timer_pending(&rdp->nocb_timer)],
- "bB"[timer_pending(&rdp->nocb_bypass_timer)],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[swait_active(&rdp->nocb_gp_wq)],
- ".W"[swait_active(&rnp->nocb_gp_wq[0])],
- ".W"[swait_active(&rnp->nocb_gp_wq[1])],
- ".B"[!!rdp->nocb_gp_bypass],
- ".G"[!!rdp->nocb_gp_gp],
- (long)rdp->nocb_gp_seq,
- rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
-}
-
-/* Dump out nocb kthread state for the specified rcu_data structure. */
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
- struct rcu_segcblist *rsclp = &rdp->cblist;
- bool waslocked;
- bool wastimer;
- bool wassleep;
-
- if (rdp->nocb_gp_rdp == rdp)
- show_rcu_nocb_gp_state(rdp);
-
- pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
- rdp->cpu, rdp->nocb_gp_rdp->cpu,
- "kK"[!!rdp->nocb_cb_kthread],
- "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
- "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
- "sS"[!!rdp->nocb_cb_sleep],
- ".W"[swait_active(&rdp->nocb_cb_wq)],
- jiffies - rdp->nocb_bypass_first,
- jiffies - rdp->nocb_nobypass_last,
- rdp->nocb_nobypass_count,
- ".D"[rcu_segcblist_ready_cbs(rsclp)],
- ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
- ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
- ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
- ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
- rcu_segcblist_n_cbs(&rdp->cblist));
-
- /* It is OK for GP kthreads to have GP state. */
- if (rdp->nocb_gp_rdp == rdp)
- return;
-
- waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
- wastimer = timer_pending(&rdp->nocb_bypass_timer);
- wassleep = swait_active(&rdp->nocb_gp_wq);
- if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep)
-		return;  /* Nothing untoward. */
-
- pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n",
- "lL"[waslocked],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[wastimer],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[wassleep]);
-}
-
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-
-/* No ->nocb_lock to acquire. */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- local_irq_restore(flags);
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
-}
-
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return NULL;
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
-}
-
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- return true;
-}
-
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- return false;
-}
-
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags)
-{
- WARN_ON_ONCE(1); /* Should be dead code! */
-}
-
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
-}
-
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
-{
- return false;
-}
-
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
-}
-
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
-}
-
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-}
-
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -2543,7 +1283,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
* CPU unless the grace period has extended for too long.
*
* This code relies on the fact that all NO_HZ_FULL CPUs are also
- * CONFIG_RCU_NOCB_CPU CPUs.
+ * RCU_NOCB_CPU CPUs.
*/
static bool rcu_nohz_full_cpu(void)
{
@@ -2563,39 +1303,5 @@ static void rcu_bind_gp_kthread(void)
{
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current, HK_FLAG_RCU);
-}
-
-/* Record the current task on dyntick-idle entry. */
-static void noinstr rcu_dynticks_task_enter(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Record no current task on dyntick-idle exit. */
-static void noinstr rcu_dynticks_task_exit(void)
-{
-#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
- WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
-#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
-}
-
-/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
-static void rcu_dynticks_task_trace_enter(void)
-{
-#ifdef CONFIG_TASKS_RCU_TRACE
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = true;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
-}
-
-/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
-static void rcu_dynticks_task_trace_exit(void)
-{
-#ifdef CONFIG_TASKS_RCU_TRACE
- if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
- current->trc_reader_special.b.need_mb = false;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
+ housekeeping_affine(current, HK_TYPE_RCU);
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 70d48c52fabc..5d666428546b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -7,6 +7,9 @@
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/kvm_para.h>
+#include <linux/rcu_notifier.h>
+
//////////////////////////////////////////////////////////////////////////////
//
// Controlling CPU stall warnings, including delay calculation.
@@ -23,6 +26,34 @@ int sysctl_max_rcu_stall_to_panic __read_mostly;
#define RCU_STALL_MIGHT_DIV 8
#define RCU_STALL_MIGHT_MIN (2 * HZ)
+int rcu_exp_jiffies_till_stall_check(void)
+{
+ int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout);
+ int exp_stall_delay_delta = 0;
+ int till_stall_check;
+
+ // Zero says to use rcu_cpu_stall_timeout, but in milliseconds.
+ if (!cpu_stall_timeout)
+ cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check());
+
+ // Limit check must be consistent with the Kconfig limits for
+ // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.
+ // The minimum clamped value is "2UL", because at least one full
+ // tick has to be guaranteed.
+ till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ);
+
+ if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)
+ WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check));
+
+#ifdef CONFIG_PROVE_RCU
+ /* Add extra ~25% out of till_stall_check. */
+ exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1;
+#endif
+
+ return till_stall_check + exp_stall_delay_delta;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check);
+
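As a worked example of the clamping above, assuming HZ=1000: a boot-time rcu_exp_cpu_stall_timeout of 20 ms gives msecs_to_jiffies(20) = 20 jiffies, which lies within [2, 300 * HZ] and is therefore used unchanged; under CONFIG_PROVE_RCU the extra ~25% adds (20 * 25) / 100 + 1 = 6 jiffies, so the expedited stall check fires after 26 jiffies. A timeout of zero instead falls back to rcu_cpu_stall_timeout, converted to milliseconds.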
/* Limit-check stall timeouts specified at boottime and runtime. */
int rcu_jiffies_till_stall_check(void)
{
@@ -117,17 +148,19 @@ static void panic_on_rcu_stall(void)
}
/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
*
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
+ * To honor the caller's reset request, disable stall detection until
+ * three fqs loop passes have elapsed. This is required to ensure that
+ * a fresh jiffies value has been loaded. It should be safe to do this
+ * from the fqs loop, as enough timer interrupts and context switches
+ * will have occurred by then.
*
* The caller must disable hard irqs.
*/
void rcu_cpu_stall_reset(void)
{
- WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3);
+ WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX);
}
//////////////////////////////////////////////////////////////////////////////
@@ -143,6 +176,7 @@ static void record_gp_stall_check_time(void)
WRITE_ONCE(rcu_state.gp_start, j);
j1 = rcu_jiffies_till_stall_check();
smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq.
+ WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0);
WRITE_ONCE(rcu_state.jiffies_stall, j + j1);
rcu_state.jiffies_resched = j + j1 / 2;
rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
@@ -241,16 +275,16 @@ struct rcu_stall_chk_rdr {
* Report out the state of a not-running task that is stalling the
* current RCU grace period.
*/
-static bool check_slow_task(struct task_struct *t, void *arg)
+static int check_slow_task(struct task_struct *t, void *arg)
{
struct rcu_stall_chk_rdr *rscrp = arg;
if (task_curr(t))
- return false; // It is running, so decline to inspect it.
+ return -EBUSY; // It is running, so decline to inspect it.
rscrp->nesting = t->rcu_read_lock_nesting;
rscrp->rs = t->rcu_read_unlock_special;
rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
- return true;
+ return 0;
}
/*
@@ -266,8 +300,11 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
struct task_struct *t;
struct task_struct *ts[8];
- if (!rcu_preempt_blocked_readers_cgp(rnp))
+ lockdep_assert_irqs_disabled();
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
+ }
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
rnp->level, rnp->grplo, rnp->grphi);
t = list_entry(rnp->gp_tasks->prev,
@@ -279,9 +316,9 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
break;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- for (i--; i; i--) {
- t = ts[i];
- if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))
+ while (i) {
+ t = ts[--i];
+ if (task_call_func(t, check_slow_task, &rscr))
pr_cont(" P%d", t->pid);
else
pr_cont(" P%d/%d:%c%c%c%c",
@@ -290,6 +327,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
".q"[rscr.rs.b.need_qs],
".e"[rscr.rs.b.exp_hint],
".l"[rscr.on_blkd_list]);
+ lockdep_assert_irqs_disabled();
put_task_struct(t);
ndetected++;
}
@@ -312,6 +350,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
* tasks blocked within RCU read-side critical sections.
*/
static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
@@ -333,33 +372,16 @@ static void rcu_dump_cpu_stacks(void)
rcu_for_each_leaf_node(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
- if (!trigger_single_cpu_backtrace(cpu))
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ if (cpu_is_offline(cpu))
+ pr_err("Offline CPU %d blocking current GP.\n", cpu);
+ else
dump_cpu_task(cpu);
+ }
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
- rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- !!rdp->tick_nohz_enabled_snap);
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- *cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
static const char * const gp_state_names[] = {
[RCU_GP_IDLE] = "RCU_GP_IDLE",
[RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS",
@@ -392,6 +414,56 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
return j > 2 * HZ;
}
+static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp)
+{
+ int cpu;
+ struct task_struct *rcuc;
+ unsigned long j;
+
+ rcuc = rdp->rcu_cpu_kthread_task;
+ if (!rcuc)
+ return false;
+
+ cpu = task_cpu(rcuc);
+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
+ return false;
+
+ j = jiffies - READ_ONCE(rdp->rcuc_activity);
+
+ if (jp)
+ *jp = j;
+ return j > 2 * HZ;
+}
+
+static void print_cpu_stat_info(int cpu)
+{
+ struct rcu_snap_record rsr, *rsrp;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu);
+
+ if (!rcu_cpu_stall_cputime)
+ return;
+
+ rsrp = &rdp->snap_record;
+ if (rsrp->gp_seq != rdp->gp_seq)
+ return;
+
+ rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+ rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+ rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+
+ pr_err("\t hardirqs softirqs csw/system\n");
+ pr_err("\t number: %8ld %10d %12lld\n",
+ kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs,
+ kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs,
+ nr_context_switches_cpu(cpu) - rsrp->nr_csw);
+ pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n",
+ div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC),
+ div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC),
+ jiffies_to_msecs(jiffies - rsrp->jiffies));
+}
+
/*
* Print out diagnostic information for the specified stalled CPU.
*
@@ -401,16 +473,18 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
* of RCU grace periods that this CPU is ignorant of, for example, "1"
* if the CPU was aware of the previous grace period.
*
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ * Also print out idle info.
*/
static void print_cpu_stall_info(int cpu)
{
unsigned long delta;
bool falsepositive;
- char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
char *ticks_title;
unsigned long ticks_value;
+ bool rcuc_starved;
+ unsigned long j;
+ char buf[32];
/*
* We could be printing a lot while holding a spinlock. Avoid
@@ -425,11 +499,13 @@ static void print_cpu_stall_info(int cpu)
ticks_title = "ticks this GP";
ticks_value = rdp->ticks_this_gp;
}
- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
- rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n",
+ rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
+ rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
+ if (rcuc_starved)
+ sprintf(buf, " rcuc=%ld jiffies(starved)", j);
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
@@ -438,36 +514,79 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- rcu_dynticks_snap(rdp) & 0xfff,
- rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
+ rcu_dynticks_snap(cpu) & 0xffff,
+ ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
- fast_no_hz,
+ rcuc_starved ? buf : "",
falsepositive ? " (false positive?)" : "");
+
+ print_cpu_stat_info(cpu);
}
/* Complain about starvation of grace-period kthread. */
static void rcu_check_gp_kthread_starvation(void)
{
+ int cpu;
struct task_struct *gpk = rcu_state.gp_kthread;
unsigned long j;
if (rcu_is_gp_kthread_starving(&j)) {
- pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
+ cpu = gpk ? task_cpu(gpk) : -1;
+ pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
- data_race(rcu_state.gp_flags),
- gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
+ data_race(READ_ONCE(rcu_state.gp_flags)),
+ gp_state_getname(rcu_state.gp_state),
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
if (gpk) {
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
sched_show_task(gpk);
+ if (cpu_is_offline(cpu)) {
+ pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
+ } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) {
+ pr_err("Stack dump where RCU GP kthread last ran:\n");
+ dump_cpu_task(cpu);
+ }
wake_up_process(gpk);
}
}
}
+/* Complain about missing wakeups from expired fqs wait timer */
+static void rcu_check_gp_kthread_expired_fqs_timer(void)
+{
+ struct task_struct *gpk = rcu_state.gp_kthread;
+ short gp_state;
+ unsigned long jiffies_fqs;
+ int cpu;
+
+ /*
+ * Order reads of .gp_state and .jiffies_force_qs.
+ * Matching smp_wmb() is present in rcu_gp_fqs_loop().
+ */
+ gp_state = smp_load_acquire(&rcu_state.gp_state);
+ jiffies_fqs = READ_ONCE(rcu_state.jiffies_force_qs);
+
+ if (gp_state == RCU_GP_WAIT_FQS &&
+ time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) &&
+ gpk && !READ_ONCE(gpk->on_rq)) {
+ cpu = task_cpu(gpk);
+ pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
+ rcu_state.name, (jiffies - jiffies_fqs),
+ (long)rcu_seq_current(&rcu_state.gp_seq),
+ data_race(rcu_state.gp_flags),
+ gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
+ data_race(READ_ONCE(gpk->__state)));
+ pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
+ cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
+ }
+}
+
static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
{
int cpu;
@@ -478,6 +597,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
struct rcu_node *rnp;
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -488,6 +609,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
* See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
+ trace_rcu_stall_warning(rcu_state.name, TPS("StallDetected"));
pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
rcu_for_each_leaf_node(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -499,13 +621,14 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
}
}
ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock.
+ lockdep_assert_irqs_disabled();
}
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
+ pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
smp_processor_id(), (long)(jiffies - gps),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
if (ndetected) {
rcu_dump_cpu_stacks();
@@ -517,11 +640,11 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
- gpa = data_race(rcu_state.gp_activity);
+ gpa = data_race(READ_ONCE(rcu_state.gp_activity));
pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rcu_state.name, j - gpa, j, gpa,
- data_race(jiffies_till_next_fqs),
- rcu_get_root()->qsmask);
+ data_race(READ_ONCE(jiffies_till_next_fqs)),
+ data_race(READ_ONCE(rcu_get_root()->qsmask)));
}
}
/* Rewrite if needed in case of slow consoles. */
@@ -529,6 +652,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
WRITE_ONCE(rcu_state.jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+ rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
panic_on_rcu_stall();
@@ -544,6 +668,8 @@ static void print_cpu_stall(unsigned long gps)
struct rcu_node *rnp = rcu_get_root();
long totqlen = 0;
+ lockdep_assert_irqs_disabled();
+
/* Kick and suppress, if so configured. */
rcu_stall_kick_kthreads();
if (rcu_stall_is_suppressed())
@@ -554,16 +680,18 @@ static void print_cpu_stall(unsigned long gps)
* See Documentation/RCU/stallwarn.rst for info on how to debug
* RCU CPU stall warnings.
*/
+ trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected"));
pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
print_cpu_stall_info(smp_processor_id());
raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
for_each_possible_cpu(cpu)
totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
+ pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
jiffies - gps,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+ rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
rcu_dump_cpu_stacks();
@@ -590,6 +718,7 @@ static void print_cpu_stall(unsigned long gps)
static void check_cpu_stall(struct rcu_data *rdp)
{
+ bool self_detected;
unsigned long gs1;
unsigned long gs2;
unsigned long gps;
@@ -598,10 +727,21 @@ static void check_cpu_stall(struct rcu_data *rdp)
unsigned long js;
struct rcu_node *rnp;
+ lockdep_assert_irqs_disabled();
if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) ||
!rcu_gp_in_progress())
return;
rcu_stall_kick_kthreads();
+
+ /*
+ * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS
+ * loop has to set jiffies to ensure a non-stale jiffies value. This
+	 * is required to have a good jiffies value after coming out of long
+	 * breaks in jiffies updates. Not doing so can cause false positives.
+ */
+ if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0)
+ return;
+
j = jiffies;
/*
@@ -634,33 +774,101 @@ static void check_cpu_stall(struct rcu_data *rdp)
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ jn = jiffies + ULONG_MAX / 2;
+ self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask;
if (rcu_gp_in_progress() &&
- (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+ (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall(gps);
- if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
- rcu_ftrace_dump(DUMP_ALL);
-
- } else if (rcu_gp_in_progress() &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
- cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
+ if (self_detected) {
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(gps);
+ } else {
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(gs2, gps);
+ }
- /* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(gs2, gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+
+ if (READ_ONCE(rcu_state.jiffies_stall) == jn) {
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcu_state.jiffies_stall, jn);
+ }
}
}
//////////////////////////////////////////////////////////////////////////////
//
-// RCU forward-progress mechanisms, including of callback invocation.
+// RCU forward-progress mechanisms, including for callback invocation.
/*
+ * Check to see if a failure to end RCU priority inversion was due to
+ * a CPU not passing through a quiescent state. When this happens, there
+ * is nothing that RCU priority boosting can do to help, so we shouldn't
+ * count this as an RCU priority boosting failure. A return of true says
+ * RCU priority boosting is to blame, and false says otherwise. If false
+ * is returned, the first of the CPUs to blame is stored through cpup.
+ * If there was no CPU blocking the current grace period, but also nothing
+ * in need of being boosted, *cpup is set to -1. This can happen in case
+ * of vCPU preemption while the last CPU is reporting its quiescent state,
+ * for example.
+ *
+ * If cpup is NULL, then a lockless quick check is carried out, suitable
+ * for high-rate usage. On the other hand, if cpup is non-NULL, each
+ * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
+ */
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+{
+ bool atb = false;
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ if (!cpup) {
+ if (data_race(READ_ONCE(rnp->qsmask))) {
+ return false;
+ } else {
+ if (READ_ONCE(rnp->gp_tasks))
+ atb = true;
+ continue;
+ }
+ }
+ *cpup = -1;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->gp_tasks)
+ atb = true;
+ if (!rnp->qsmask) {
+ // No CPUs without quiescent states for this rnp.
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue;
+ }
+ // Find the first holdout CPU.
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ *cpup = cpu;
+ return false;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ // Can't blame CPUs, so must blame RCU priority boosting.
+ return atb;
+}
+EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
+
+/*
* Show the state of the grace-period kthreads.
*/
void show_rcu_gp_kthreads(void)
@@ -670,29 +878,41 @@ void show_rcu_gp_kthreads(void)
unsigned long j;
unsigned long ja;
unsigned long jr;
+ unsigned long js;
unsigned long jw;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
j = jiffies;
- ja = j - data_race(rcu_state.gp_activity);
- jr = j - data_race(rcu_state.gp_req_activity);
- jw = j - data_race(rcu_state.gp_wake_time);
- pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+ ja = j - data_race(READ_ONCE(rcu_state.gp_activity));
+ jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity));
+ js = j - data_race(READ_ONCE(rcu_state.gp_start));
+ jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time));
+ pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, t ? t->state : 0x1ffffL,
- ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
- (long)data_race(rcu_state.gp_seq),
- (long)data_race(rcu_get_root()->gp_seq_needed),
- data_race(rcu_state.gp_flags));
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? t->rt_priority : 0xffU,
+ js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)),
+ (long)data_race(READ_ONCE(rcu_state.gp_seq)),
+ (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)),
+ data_race(READ_ONCE(rcu_state.gp_max)),
+ data_race(READ_ONCE(rcu_state.gp_flags)));
rcu_for_each_node_breadth_first(rnp) {
- if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq),
- READ_ONCE(rnp->gp_seq_needed)))
+ if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
+ !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) &&
+ !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks)))
continue;
- pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
- rnp->grplo, rnp->grphi, (long)data_race(rnp->gp_seq),
- (long)data_race(rnp->gp_seq_needed));
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
+ rnp->grplo, rnp->grphi,
+ (long)data_race(READ_ONCE(rnp->gp_seq)),
+ (long)data_race(READ_ONCE(rnp->gp_seq_needed)),
+ data_race(READ_ONCE(rnp->qsmask)),
+ ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))],
+ ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))],
+ ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))],
+ ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))],
+ data_race(READ_ONCE(rnp->n_boosts)));
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
@@ -702,12 +922,12 @@ void show_rcu_gp_kthreads(void)
READ_ONCE(rdp->gp_seq_needed)))
continue;
pr_info("\tcpu %d ->gp_seq_needed %ld\n",
- cpu, (long)data_race(rdp->gp_seq_needed));
+ cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed)));
}
}
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- cbs += data_race(rdp->n_cbs_invoked);
+ cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
if (rcu_segcblist_is_offloaded(&rdp->cblist))
show_rcu_nocb_state(rdp);
}
@@ -789,11 +1009,11 @@ void rcu_fwd_progress_check(unsigned long j)
if (rcu_gp_in_progress()) {
pr_info("%s: GP age %lu jiffies\n",
- __func__, jiffies - rcu_state.gp_start);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start)));
show_rcu_gp_kthreads();
} else {
pr_info("%s: Last GP end %lu jiffies ago\n",
- __func__, jiffies - rcu_state.gp_end);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end)));
preempt_disable();
rdp = this_cpu_ptr(&rcu_data);
rcu_check_gp_start_stall(rdp->mynode, rdp, j);
@@ -821,7 +1041,7 @@ static bool sysrq_rcu;
module_param(sysrq_rcu, bool, 0444);
/* Dump grace-period-request information due to commandeered sysrq. */
-static void sysrq_show_rcu(int key)
+static void sysrq_show_rcu(u8 key)
{
show_rcu_gp_kthreads();
}
@@ -840,3 +1060,67 @@ static int __init rcu_sysrq_init(void)
return 0;
}
early_initcall(rcu_sysrq_init);
+
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU CPU stall-warning notifiers
+
+static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
+
+/**
+ * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Adds an RCU CPU stall notifier to an atomic notifier chain.
+ * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or
+ * friends. The @data will be the duration of the stalled grace period,
+ * in jiffies, coerced to a void* pointer.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int rcu_stall_chain_notifier_register(struct notifier_block *n)
+{
+ int rcsn = rcu_cpu_stall_notifiers;
+
+ WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call,
+ rcsn ? "possibly suppressing RCU CPU stall warnings" : "failed, so all is well");
+ if (rcsn)
+ return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+ return -EEXIST;
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
+
+/**
+ * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier
+ * @n: Entry to remove.
+ *
+ * Removes an RCU CPU stall notifier from an atomic notifier chain.
+ *
+ * Returns zero on success, %-ENOENT on failure.
+ */
+int rcu_stall_chain_notifier_unregister(struct notifier_block *n)
+{
+ return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n);
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister);
+
+/*
+ * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain
+ * @val: Value passed unmodified to notifier function
+ * @v: Pointer passed unmodified to notifier function
+ *
+ * Calls each function in the RCU CPU stall notifier chain in turn, which
+ * is an atomic call chain. See atomic_notifier_call_chain() for more
+ * information.
+ *
+ * This is for use within RCU, hence the omission of the extra asterisk
+ * to indicate a non-kerneldoc format header comment.
+ */
+int rcu_stall_notifier_call_chain(unsigned long val, void *v)
+{
+ return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v);
+}
+
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
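
As a rough illustration of how this new chain is meant to be consumed, a debugging module might register a notifier along the following lines. This is only a sketch: the my_* names and module boilerplate are invented, and the <linux/rcu_notifier.h> header is an assumption; rcu_stall_chain_notifier_register()/_unregister(), the RCU_STALL_NOTIFY_NORM action, and the stall-duration-in-jiffies @data convention all come from the hunk above.

/* Hypothetical debugging module, not part of this patch. */
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>
#include <linux/rcu_notifier.h>	/* assumed home of the register/unregister declarations */

static int my_stall_cb(struct notifier_block *nb, unsigned long action, void *data)
{
	/* @action is RCU_STALL_NOTIFY_NORM or a friend; @data is the duration
	 * of the stalled grace period in jiffies, coerced to a void pointer. */
	pr_info("RCU stall notifier: action=%lu duration=%lu jiffies\n",
		action, (unsigned long)data);
	return NOTIFY_OK;
}

static struct notifier_block my_stall_nb = {
	.notifier_call = my_stall_cb,
};

static int __init my_stall_init(void)
{
	/* Always WARNs; succeeds only if the kernel was built with
	 * CONFIG_RCU_CPU_STALL_NOTIFIER=y and booted with
	 * rcupdate.rcu_cpu_stall_notifiers=1, otherwise returns -EEXIST. */
	return rcu_stall_chain_notifier_register(&my_stall_nb);
}

static void __exit my_stall_exit(void)
{
	rcu_stall_chain_notifier_unregister(&my_stall_nb);
}

module_init(my_stall_init);
module_exit(my_stall_exit);
MODULE_LICENSE("GPL");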
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 39334d2d2b37..46aaaa9fe339 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -25,6 +25,7 @@
#include <linux/interrupt.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
+#include <linux/torture.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
@@ -54,10 +55,12 @@
#define MODULE_PARAM_PREFIX "rcupdate."
#ifndef CONFIG_TINY_RCU
-module_param(rcu_expedited, int, 0);
-module_param(rcu_normal, int, 0);
-static int rcu_normal_after_boot;
-module_param(rcu_normal_after_boot, int, 0);
+module_param(rcu_expedited, int, 0444);
+module_param(rcu_normal, int, 0444);
+static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT);
+#if !defined(CONFIG_PREEMPT_RT) || defined(CONFIG_NO_HZ_FULL)
+module_param(rcu_normal_after_boot, int, 0444);
+#endif
#endif /* #ifndef CONFIG_TINY_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -83,7 +86,7 @@ module_param(rcu_normal_after_boot, int, 0);
* and while lockdep is disabled.
*
* Note that if the CPU is in the idle loop from an RCU point of view (ie:
- * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * that we are in the section between ct_idle_enter() and ct_idle_exit())
* then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an
* rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
* in such a section, considering these as in extended quiescent state,
@@ -142,8 +145,45 @@ bool rcu_gp_is_normal(void)
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
-static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
+static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1);
+/*
+ * Should call_rcu() callbacks be processed with urgency or are
+ * they OK being executed with arbitrary delays?
+ */
+bool rcu_async_should_hurry(void)
+{
+ return !IS_ENABLED(CONFIG_RCU_LAZY) ||
+ atomic_read(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_should_hurry);
+
+/**
+ * rcu_async_hurry - Make future async RCU callbacks not lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a timely fashion.
+ */
+void rcu_async_hurry(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_inc(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_hurry);
+/**
+ * rcu_async_relax - Make future async RCU callbacks lazy.
+ *
+ * After a call to this function, future calls to call_rcu()
+ * will be processed in a lazy fashion.
+ */
+void rcu_async_relax(void)
+{
+ if (IS_ENABLED(CONFIG_RCU_LAZY))
+ atomic_dec(&rcu_async_hurry_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_async_relax);
+
+static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
/*
* Should normal grace-period primitives be expedited? Intended for
* use within RCU. Note that this function takes the rcu_expedited
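
The hurry/relax pair added above maintains a nesting counter consulted by rcu_async_should_hurry(): while the count is nonzero (or CONFIG_RCU_LAZY is off), call_rcu() callbacks are processed promptly rather than lazily. A hedged sketch of the intended bracketing pattern follows; rcu_async_hurry(), rcu_async_relax(), call_rcu(), and rcu_barrier() are the real APIs (declarations assumed visible via <linux/rcupdate.h>), while the my_* names are invented for illustration.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
	struct rcu_head rh;
	int payload;
};

static void my_obj_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct my_obj, rh));
}

static void my_teardown(struct my_obj *p)
{
	rcu_async_hurry();		/* subsequent callbacks must not be lazy */
	call_rcu(&p->rh, my_obj_free_cb);
	rcu_barrier();			/* wait for previously queued callbacks */
	rcu_async_relax();		/* laziness is acceptable again */
}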
@@ -193,6 +233,7 @@ static bool rcu_boot_ended __read_mostly;
void rcu_end_inkernel_boot(void)
{
rcu_unexpedite_gp();
+ rcu_async_relax();
if (rcu_normal_after_boot)
WRITE_ONCE(rcu_normal, 1);
rcu_boot_ended = true;
@@ -218,11 +259,12 @@ void rcu_test_sync_prims(void)
{
if (!IS_ENABLED(CONFIG_PROVE_RCU))
return;
+ pr_info("Running RCU synchronous self tests\n");
synchronize_rcu();
synchronize_rcu_expedited();
}
-#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)
+#if !defined(CONFIG_TINY_RCU)
/*
* Switch to run-time mode once RCU has fully initialized.
@@ -237,7 +279,7 @@ static int __init rcu_set_runtime_mode(void)
}
core_initcall(rcu_set_runtime_mode);
-#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */
+#endif /* #if !defined(CONFIG_TINY_RCU) */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
@@ -245,7 +287,7 @@ struct lockdep_map rcu_lock_map = {
.name = "rcu_read_lock",
.key = &rcu_lock_key,
.wait_type_outer = LD_WAIT_FREE,
- .wait_type_inner = LD_WAIT_CONFIG, /* XXX PREEMPT_RCU ? */
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT implies PREEMPT_RCU */
};
EXPORT_SYMBOL_GPL(rcu_lock_map);
@@ -254,7 +296,7 @@ struct lockdep_map rcu_bh_lock_map = {
.name = "rcu_read_lock_bh",
.key = &rcu_bh_lock_key,
.wait_type_outer = LD_WAIT_FREE,
- .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_LOCK also makes BH preemptible */
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */
};
EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
@@ -275,7 +317,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
noinstr int notrace debug_lockdep_rcu_enabled(void)
{
- return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
+ return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) &&
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -405,6 +447,13 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);
+void finish_rcuwait(struct rcuwait *w)
+{
+ rcu_assign_pointer(w->task, NULL);
+ __set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL_GPL(finish_rcuwait);
+
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
{
@@ -476,27 +525,39 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
do { } while (0)
#endif
-#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
/* Get rcutorture access to sched_setaffinity(). */
-long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
int ret;
ret = sched_setaffinity(pid, in_mask);
- WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
+ WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
return ret;
}
-EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
+EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
#endif
+int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful)
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers);
+
#ifdef CONFIG_RCU_STALL_COMMON
int rcu_cpu_stall_ftrace_dump __read_mostly;
module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+module_param(rcu_cpu_stall_notifiers, int, 0444);
+#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
+int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;
+module_param(rcu_exp_cpu_stall_timeout, int, 0644);
+int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME);
+module_param(rcu_cpu_stall_cputime, int, 0644);
+bool rcu_exp_stall_task_details __read_mostly;
+module_param(rcu_exp_stall_task_details, bool, 0644);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall
@@ -505,6 +566,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls.
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot);
module_param(rcu_cpu_stall_suppress_at_boot, int, 0444);
+/**
+ * get_completed_synchronize_rcu - Return a pre-completed polled state cookie
+ *
+ * Returns a value that will always be treated by functions like
+ * poll_state_synchronize_rcu() as a cookie whose grace period has already
+ * completed.
+ */
+unsigned long get_completed_synchronize_rcu(void)
+{
+ return RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu);
+
#ifdef CONFIG_PROVE_RCU
/*
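
Since get_completed_synchronize_rcu() returns a cookie that poll_state_synchronize_rcu() always treats as already completed, it is handy for initializing a cached cookie so that the very first poll succeeds without waiting. A minimal sketch, using invented my_* names around the real polled grace-period API (get_state_synchronize_rcu(), poll_state_synchronize_rcu(), and the new helper):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_cache {
	unsigned long gp_cookie;	/* polled grace-period cookie */
	void *stale;			/* single slot awaiting a grace period */
};

static void my_cache_init(struct my_cache *c)
{
	/* Pretend a grace period already elapsed, so the first reclaim
	 * attempt below takes the fast path. */
	c->gp_cookie = get_completed_synchronize_rcu();
	c->stale = NULL;
}

static void my_cache_retire(struct my_cache *c, void *old)
{
	/* Caller ensures the previous stale object was already reclaimed. */
	c->stale = old;
	c->gp_cookie = get_state_synchronize_rcu();	/* snapshot current GP */
}

static void my_cache_try_reclaim(struct my_cache *c)
{
	if (c->stale && poll_state_synchronize_rcu(c->gp_cookie)) {
		kfree(c->stale);	/* a grace period has elapsed since retire */
		c->stale = NULL;
	}
}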
@@ -522,6 +596,7 @@ static void test_callback(struct rcu_head *r)
}
DEFINE_STATIC_SRCU(early_srcu);
+static unsigned long early_srcu_cookie;
struct early_boot_kfree_rcu {
struct rcu_head rh;
@@ -530,12 +605,15 @@ struct early_boot_kfree_rcu {
static void early_boot_test_call_rcu(void)
{
static struct rcu_head head;
+ int idx;
static struct rcu_head shead;
struct early_boot_kfree_rcu *rhp;
+ idx = srcu_down_read(&early_srcu);
+ srcu_up_read(&early_srcu, idx);
call_rcu(&head, test_callback);
- if (IS_ENABLED(CONFIG_SRCU))
- call_srcu(&early_srcu, &shead, test_callback);
+ early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);
+ call_srcu(&early_srcu, &shead, test_callback);
rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
if (!WARN_ON_ONCE(!rhp))
kfree_rcu(rhp, rh);
@@ -558,10 +636,10 @@ static int rcu_verify_early_boot_tests(void)
if (rcu_self_test) {
early_boot_test_counter++;
rcu_barrier();
- if (IS_ENABLED(CONFIG_SRCU)) {
- early_boot_test_counter++;
- srcu_barrier(&early_srcu);
- }
+ early_boot_test_counter++;
+ srcu_barrier(&early_srcu);
+ WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie));
+ cleanup_srcu_struct(&early_srcu);
}
if (rcu_self_test_counter != early_boot_test_counter) {
WARN_ON(1);