summaryrefslogtreecommitdiff
path: root/kernel/rcu
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/rcu')
-rw-r--r--kernel/rcu/rcutorture.c356
-rw-r--r--kernel/rcu/refscale.c42
-rw-r--r--kernel/rcu/srcutree.c2
-rw-r--r--kernel/rcu/tree.c84
-rw-r--r--kernel/rcu/tree.h13
-rw-r--r--kernel/rcu/tree_exp.h59
-rw-r--r--kernel/rcu/tree_nocb.h10
-rw-r--r--kernel/rcu/tree_plugin.h122
-rw-r--r--kernel/rcu/tree_stall.h36
9 files changed, 537 insertions, 187 deletions
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 70ec0f21abc3..7a893d51d02b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -55,22 +55,24 @@ MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
-/* Bits for ->extendables field, extendables param, and related definitions. */
-#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1)
-#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2)
-#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
-#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
-#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
-#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
-#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
-#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */
-#define RCUTORTURE_MAX_EXTEND \
+// Bits for ->extendables field, extendables param, and related definitions.
+#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits.
+#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits.
+#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2)
+#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh.
+#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts.
+#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption.
+#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh().
+#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched().
+#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_UPDOWN 0x80 // ... up-read from task, down-read from timer.
+ // Note: Manual start, automatic end.
+#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above.
+#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
- RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
+ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN.
#define RCUTORTURE_RDR_ALLBITS \
(RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \
RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2)
@@ -110,6 +112,7 @@ torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
+torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers");
torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
@@ -156,6 +159,7 @@ static int nrealfakewriters;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
+static struct task_struct *updown_task;
static struct task_struct **nocb_tasks;
static struct task_struct *stats_task;
static struct task_struct *fqs_task;
@@ -378,6 +382,8 @@ struct rcu_torture_ops {
void (*readunlock)(int idx);
int (*readlock_held)(void); // lockdep.
int (*readlock_nesting)(void); // actual nesting, if available, -1 if not.
+ int (*down_read)(void);
+ void (*up_read)(int idx);
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
@@ -427,6 +433,7 @@ struct rcu_torture_ops {
int no_pi_lock;
int debug_objects;
int start_poll_irqsoff;
+ int have_up_down;
const char *name;
};
@@ -464,7 +471,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
- if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK))
+ if ((preempt_count() & HARDIRQ_MASK) || softirq_count())
longdelay_ms = 5; /* Avoid triggering BH limits. */
mdelay(longdelay_ms);
rtrsp->rt_delay_ms = longdelay_ms;
@@ -711,11 +718,6 @@ static int srcu_torture_read_lock(void)
WARN_ON_ONCE(idx & ~0x1);
ret += idx << 1;
}
- if (reader_flavor & SRCU_READ_FLAVOR_LITE) {
- idx = srcu_read_lock_lite(srcu_ctlp);
- WARN_ON_ONCE(idx & ~0x1);
- ret += idx << 2;
- }
if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
scp = srcu_read_lock_fast(srcu_ctlp);
idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
@@ -749,8 +751,6 @@ static void srcu_torture_read_unlock(int idx)
WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
if (reader_flavor & SRCU_READ_FLAVOR_FAST)
srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
- if (reader_flavor & SRCU_READ_FLAVOR_LITE)
- srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2);
if (reader_flavor & SRCU_READ_FLAVOR_NMI)
srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
@@ -762,6 +762,50 @@ static int torture_srcu_read_lock_held(void)
return srcu_read_lock_held(srcu_ctlp);
}
+static bool srcu_torture_have_up_down(void)
+{
+ int rf = reader_flavor;
+
+ if (!rf)
+ rf = SRCU_READ_FLAVOR_NORMAL;
+ return !!(cur_ops->have_up_down & rf);
+}
+
+static int srcu_torture_down_read(void)
+{
+ int idx;
+ struct srcu_ctr __percpu *scp;
+
+ WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL);
+ WARN_ON_ONCE(reader_flavor & (reader_flavor - 1));
+
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
+ idx = srcu_down_read(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ return idx;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ scp = srcu_down_read_fast(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ return idx << 3;
+ }
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+static void srcu_torture_up_read(int idx)
+{
+ WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+ srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+ else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) ||
+ !(reader_flavor & SRCU_READ_FLAVOR_ALL))
+ srcu_up_read(srcu_ctlp, idx & 0x1);
+ else
+ WARN_ON_ONCE(1);
+}
+
static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(srcu_ctlp);
@@ -819,6 +863,8 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .down_read = srcu_torture_down_read,
+ .up_read = srcu_torture_up_read,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.gp_diff = rcu_seq_diff,
@@ -839,6 +885,8 @@ static struct rcu_torture_ops srcu_ops = {
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.debug_objects = 1,
+ .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST,
.name = "srcu"
};
@@ -864,6 +912,8 @@ static struct rcu_torture_ops srcud_ops = {
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
+ .down_read = srcu_torture_down_read,
+ .up_read = srcu_torture_up_read,
.get_gp_seq = srcu_torture_completed,
.gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
@@ -883,6 +933,8 @@ static struct rcu_torture_ops srcud_ops = {
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.debug_objects = 1,
+ .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST,
.name = "srcud"
};
@@ -910,7 +962,8 @@ static struct rcu_torture_ops busted_srcud_ops = {
/*
* Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
- * This implementation does not necessarily work well with CPU hotplug.
+ * This implementation does not work well with CPU hotplug nor
+ * with rcutorture's shuffling.
*/
static void synchronize_rcu_trivial(void)
@@ -923,6 +976,16 @@ static void synchronize_rcu_trivial(void)
}
}
+static void rcu_sync_torture_init_trivial(void)
+{
+ rcu_sync_torture_init();
+ // if (onoff_interval || shuffle_interval) {
+ if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) {
+ onoff_interval = 0;
+ shuffle_interval = 0;
+ }
+}
+
static int rcu_torture_read_lock_trivial(void)
{
preempt_disable();
@@ -936,7 +999,7 @@ static void rcu_torture_read_unlock_trivial(int idx)
static struct rcu_torture_ops trivial_ops = {
.ttype = RCU_TRIVIAL_FLAVOR,
- .init = rcu_sync_torture_init,
+ .init = rcu_sync_torture_init_trivial,
.readlock = rcu_torture_read_lock_trivial,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock_trivial,
@@ -1722,6 +1785,7 @@ rcu_torture_writer(void *arg)
cur_ops->gp_kthread_dbg();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
+ break;
}
if (stutter_waited)
sched_set_normal(current, oldnice);
@@ -1915,14 +1979,14 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
// Verify the specified RCUTORTURE_RDR* state.
#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count()
-static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq)
+static void rcutorture_one_extend_check(char *s, int curstate, int new, int old)
{
int mask;
- if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE))
+ if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi())
return;
- WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS);
+ WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS);
WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS);
// If CONFIG_PREEMPT_COUNT=n, further checks are unreliable.
@@ -1930,21 +1994,21 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
return;
WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
- !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
+ !softirq_count(), ROEC_ARGS);
WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
!(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
WARN_ONCE(cur_ops->readlock_nesting &&
(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
cur_ops->readlock_nesting() == 0, ROEC_ARGS);
- // Timer handlers have all sorts of stuff disabled, so ignore
+ // Interrupt handlers have all sorts of stuff disabled, so ignore
// unintended disabling.
- if (insoftirq)
+ if (in_serving_softirq() || in_hardirq())
return;
WARN_ONCE(cur_ops->extendables &&
!(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
- (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
+ softirq_count(), ROEC_ARGS);
/*
* non-preemptible RCU in a preemptible kernel uses preempt_disable()
@@ -1965,6 +2029,9 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count())
+ mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+
WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) &&
cur_ops->readlock_nesting() > 0, ROEC_ARGS);
}
@@ -1978,8 +2045,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
* beginning or end of the critical section and if there was actually a
* change, do a ->read_delay().
*/
-static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
- struct torture_random_state *trsp,
+static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp,
struct rt_read_seg *rtrsp)
{
bool first;
@@ -1993,8 +2059,8 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
first = idxold1 == 0;
WARN_ON_ONCE(idxold2 < 0);
- WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS);
- rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq);
+ WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN));
+ rcutorture_one_extend_check("before change", idxold1, statesnew, statesold);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
@@ -2014,8 +2080,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2;
// Complain unless both the old and the new protection is in place.
- rcutorture_one_extend_check("during change",
- idxold1 | statesnew, statesnew, statesold, insoftirq);
+ rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold);
// Sample CPU under both sets of protections to reduce confusion.
if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) {
@@ -2069,6 +2134,11 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
if (lockit)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
+ if (statesold & RCUTORTURE_RDR_UPDOWN) {
+ cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
+ }
/* Delay if neither beginning nor end and there was a change. */
if ((statesnew || statesold) && *readstate && newstate)
@@ -2085,7 +2155,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
WARN_ON_ONCE(*readstate < 0);
if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS))
pr_info("Unexpected readstate value of %#x\n", *readstate);
- rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq);
+ rcutorture_one_extend_check("after change", *readstate, statesnew, statesold);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -2152,8 +2222,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
* critical section.
*/
static struct rt_read_seg *
-rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp,
- struct rt_read_seg *rtrsp)
+rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp)
{
int i;
int j;
@@ -2167,7 +2236,8 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
- rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]);
+ WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN);
+ rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]);
}
return &rtrsp[j];
}
@@ -2209,10 +2279,11 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp
rtorsp->started = cur_ops->get_gp_seq();
rtorsp->ts = rcu_trace_clock_local();
rtorsp->p = rcu_dereference_check(rcu_torture_current,
- !cur_ops->readlock_held || cur_ops->readlock_held());
+ !cur_ops->readlock_held || cur_ops->readlock_held() ||
+ (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN));
if (rtorsp->p == NULL) {
/* Wait for rcu_torture_writer to get underway */
- rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp);
+ rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp);
return false;
}
if (rtorsp->p->rtort_mbtest == 0)
@@ -2226,7 +2297,7 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp
* critical sections and check for errors.
*/
static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp,
- struct torture_random_state *trsp, long myid)
+ struct torture_random_state *trsp)
{
int i;
unsigned long completed;
@@ -2273,7 +2344,7 @@ static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp,
}
if (cur_ops->reader_blocked)
preempted = cur_ops->reader_blocked();
- rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp);
+ rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp);
WARN_ON_ONCE(rtorsp->readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
@@ -2302,13 +2373,14 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
WARN_ON_ONCE(!rcu_is_watching());
init_rcu_torture_one_read_state(&rtors, trsp);
newstate = rcutorture_extend_mask(rtors.readstate, trsp);
- rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++);
+ WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN);
+ rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++);
if (!rcu_torture_one_read_start(&rtors, trsp, myid)) {
- rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp);
+ rcutorture_one_extend(&rtors.readstate, 0, trsp, rtors.rtrsp);
return false;
}
- rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, myid < 0, trsp, rtors.rtrsp);
- rcu_torture_one_read_end(&rtors, trsp, myid);
+ rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp);
+ rcu_torture_one_read_end(&rtors, trsp);
return true;
}
@@ -2378,6 +2450,152 @@ rcu_torture_reader(void *arg)
return 0;
}
+struct rcu_torture_one_read_state_updown {
+ struct hrtimer rtorsu_hrt;
+ bool rtorsu_inuse;
+ ktime_t rtorsu_kt;
+ int rtorsu_cpu;
+ unsigned long rtorsu_j;
+ unsigned long rtorsu_ndowns;
+ unsigned long rtorsu_nups;
+ unsigned long rtorsu_nmigrates;
+ struct torture_random_state rtorsu_trs;
+ struct rcu_torture_one_read_state rtorsu_rtors;
+};
+
+static struct rcu_torture_one_read_state_updown *updownreaders;
+static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand);
+static int rcu_torture_updown(void *arg);
+
+static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp)
+{
+ int cpu = raw_smp_processor_id();
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt);
+ rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ WRITE_ONCE(rtorsup->rtorsu_nmigrates,
+ rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu));
+ smp_store_release(&rtorsup->rtorsu_inuse, false);
+ return HRTIMER_NORESTART;
+}
+
+static int rcu_torture_updown_init(void)
+{
+ int i;
+ struct torture_random_state *rand = &rcu_torture_updown_rand;
+ int ret;
+
+ if (n_up_down < 0)
+ return 0;
+ if (!srcu_torture_have_up_down()) {
+ VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives");
+ return 0;
+ }
+ updownreaders = kcalloc(n_up_down, sizeof(*updownreaders), GFP_KERNEL);
+ if (!updownreaders) {
+ VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests");
+ return -ENOMEM;
+ }
+ for (i = 0; i < n_up_down; i++) {
+ init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand);
+ hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
+ torture_random_init(&updownreaders[i].rtorsu_trs);
+ init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors,
+ &updownreaders[i].rtorsu_trs);
+ }
+ ret = torture_create_kthread(rcu_torture_updown, rand, updown_task);
+ if (ret) {
+ kfree(updownreaders);
+ updownreaders = NULL;
+ }
+ return ret;
+}
+
+static void rcu_torture_updown_cleanup(void)
+{
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) {
+ if (!smp_load_acquire(&rtorsup->rtorsu_inuse))
+ continue;
+ if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) {
+ rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ smp_store_release(&rtorsup->rtorsu_inuse, false);
+ }
+
+ }
+ kfree(updownreaders);
+ updownreaders = NULL;
+}
+
+// Do one reader for rcu_torture_updown().
+static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup)
+{
+ int idx;
+ int rawidx;
+ ktime_t t;
+
+ init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ rawidx = cur_ops->down_read();
+ WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1);
+ idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1;
+ rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN;
+ rtorsup->rtorsu_rtors.rtrsp++;
+ rtorsup->rtorsu_cpu = raw_smp_processor_id();
+ if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) {
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ schedule_timeout_idle(HZ);
+ return;
+ }
+ smp_store_release(&rtorsup->rtorsu_inuse, true);
+ t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million.
+ if (t < 10 * 1000)
+ t = 200 * 1000 * 1000;
+ hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
+ smp_mb(); // Sample jiffies after posting hrtimer.
+ rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler.
+ rtorsup->rtorsu_kt = t;
+}
+
+/*
+ * RCU torture up/down reader kthread, starting RCU readers in kthread
+ * context and ending them in hrtimer handlers. Otherwise similar to
+ * rcu_torture_reader().
+ */
+static int
+rcu_torture_updown(void *arg)
+{
+ unsigned long j;
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_updown task started");
+ do {
+ for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) {
+ if (torture_must_stop())
+ break;
+ j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse.
+ if (smp_load_acquire(&rtorsup->rtorsu_inuse)) {
+ WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10),
+ "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j);
+ continue;
+ }
+ rcu_torture_updown_one(rtorsup);
+ }
+ torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand);
+ stutter_wait("rcu_torture_updown");
+ } while (!torture_must_stop());
+ rcu_torture_updown_cleanup();
+ torture_kthread_stopping("rcu_torture_updown");
+ return 0;
+}
+
/*
* Randomly Toggle CPUs' callback-offload state. This uses hrtimers to
* increase race probabilities and fuzzes the interval between toggling.
@@ -2441,6 +2659,10 @@ rcu_torture_stats_print(void)
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long n_gpwraps = 0;
+ unsigned long ndowns = 0;
+ unsigned long nunexpired = 0;
+ unsigned long nmigrates = 0;
+ unsigned long nups = 0;
struct rcu_torture *rtcp;
static unsigned long rtcv_snap = ULONG_MAX;
static bool splatted;
@@ -2454,10 +2676,18 @@ rcu_torture_stats_print(void)
if (cur_ops->get_gpwrap_count)
n_gpwraps += cur_ops->get_gpwrap_count(cpu);
}
+ if (updownreaders) {
+ for (i = 0; i < n_up_down; i++) {
+ ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns);
+ nups += READ_ONCE(updownreaders[i].rtorsu_nups);
+ nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse);
+ nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates);
+ }
+ }
for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
- }
+ } // The value of variable "i" is used later, so don't clobber it!
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
rtcp = rcu_access_pointer(rcu_torture_current);
@@ -2478,6 +2708,8 @@ rcu_torture_stats_print(void)
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
atomic_long_read(&n_rcu_torture_timers));
+ if (updownreaders)
+ pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates);
torture_onoff_stats();
pr_cont("barrier: %ld/%ld:%ld ",
data_race(n_barrier_successes),
@@ -2632,7 +2864,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"reader_flavor=%x "
"nocbs_nthreads=%d nocbs_toggle=%d "
"test_nmis=%d "
- "preempt_duration=%d preempt_interval=%d\n",
+ "preempt_duration=%d preempt_interval=%d n_up_down=%d\n",
torture_type, tag, nrealreaders, nrealfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -2646,7 +2878,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
reader_flavor,
nocbs_nthreads, nocbs_toggle,
test_nmis,
- preempt_duration, preempt_interval);
+ preempt_duration, preempt_interval, n_up_down);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -3749,6 +3981,10 @@ rcu_torture_cleanup(void)
nocb_tasks = NULL;
}
+ if (updown_task) {
+ torture_stop_kthread(rcu_torture_updown, updown_task);
+ updown_task = NULL;
+ }
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
torture_stop_kthread(rcu_torture_reader,
@@ -4245,11 +4481,6 @@ rcu_torture_init(void)
/* Start up the kthreads. */
rcu_torture_write_types();
- firsterr = torture_create_kthread(rcu_torture_writer, NULL,
- writer_task);
- if (torture_init_error(firsterr))
- goto unwind;
-
if (nrealfakewriters > 0) {
fakewriter_tasks = kcalloc(nrealfakewriters,
sizeof(fakewriter_tasks[0]),
@@ -4282,6 +4513,15 @@ rcu_torture_init(void)
if (torture_init_error(firsterr))
goto unwind;
}
+
+ firsterr = torture_create_kthread(rcu_torture_writer, NULL,
+ writer_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+
+ firsterr = rcu_torture_updown_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
nrealnocbers = nocbs_nthreads;
if (WARN_ON(nrealnocbers < 0))
nrealnocbers = 1;
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index f11a7c2af778..df646e0694a8 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -85,7 +85,7 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
// Number of typesafe_lookup structures, that is, the degree of concurrency.
torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
// Number of loops per experiment, all readers execute operations concurrently.
-torture_param(long, loops, 10000, "Number of loops per experiment.");
+torture_param(int, loops, 10000, "Number of loops per experiment.");
// Number of readers, with -1 defaulting to about 75% of the CPUs.
torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
// Number of runs.
@@ -246,36 +246,6 @@ static const struct ref_scale_ops srcu_fast_ops = {
.name = "srcu-fast"
};
-static void srcu_lite_ref_scale_read_section(const int nloops)
-{
- int i;
- int idx;
-
- for (i = nloops; i >= 0; i--) {
- idx = srcu_read_lock_lite(srcu_ctlp);
- srcu_read_unlock_lite(srcu_ctlp, idx);
- }
-}
-
-static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
-{
- int i;
- int idx;
-
- for (i = nloops; i >= 0; i--) {
- idx = srcu_read_lock_lite(srcu_ctlp);
- un_delay(udl, ndl);
- srcu_read_unlock_lite(srcu_ctlp, idx);
- }
-}
-
-static const struct ref_scale_ops srcu_lite_ops = {
- .init = rcu_sync_scale_init,
- .readsection = srcu_lite_ref_scale_read_section,
- .delaysection = srcu_lite_ref_scale_delay_section,
- .name = "srcu-lite"
-};
-
#ifdef CONFIG_TASKS_RCU
// Definitions for RCU Tasks ref scale testing: Empty read markers.
@@ -1140,7 +1110,7 @@ static void
ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
}
@@ -1193,7 +1163,7 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static const struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
+ &rcu_ops, &srcu_ops, &srcu_fast_ops, RCU_TRACE_OPS RCU_TASKS_OPS
&refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops,
&acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
@@ -1238,12 +1208,16 @@ ref_scale_init(void)
// Reader tasks (default to ~75% of online CPUs).
if (nreaders < 0)
nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
- if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops))
+ if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops))
loops = 1;
if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders))
nreaders = 1;
if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns))
nruns = 1;
+ if (WARN_ONCE(loops > INT_MAX / nreaders,
+ "%s: nreaders * loops will overflow, adjusted loops to %d",
+ __func__, INT_MAX / nreaders))
+ loops = INT_MAX / nreaders;
reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (!reader_tasks) {
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 48047260697e..c5e8ebc493d5 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -502,6 +502,8 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
*/
if (!did_gp)
smp_mb(); /* A */
+ else if (srcu_gp_is_expedited(ssp))
+ synchronize_rcu_expedited(); /* X */
else
synchronize_rcu(); /* X */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e8a4b720d7d2..174ee243b349 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -160,7 +160,6 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
-static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
@@ -377,7 +376,7 @@ EXPORT_SYMBOL_GPL(rcu_momentary_eqs);
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
- long nesting;
+ long nmi_nesting = ct_nmi_nesting();
/*
* Usually called from the tick; but also used from smp_function_call()
@@ -389,21 +388,28 @@ static int rcu_is_cpu_rrupt_from_idle(void)
/* Check for counter underflows */
RCU_LOCKDEP_WARN(ct_nesting() < 0,
"RCU nesting counter underflow!");
- RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0,
- "RCU nmi_nesting counter underflow/zero!");
- /* Are we at first interrupt nesting level? */
- nesting = ct_nmi_nesting();
- if (nesting > 1)
+ /* Non-idle interrupt or nested idle interrupt */
+ if (nmi_nesting > 1)
return false;
/*
- * If we're not in an interrupt, we must be in the idle task!
+ * Non nested idle interrupt (interrupting section where RCU
+ * wasn't watching).
*/
- WARN_ON_ONCE(!nesting && !is_idle_task(current));
+ if (nmi_nesting == 1)
+ return true;
+
+ /* Not in an interrupt */
+ if (!nmi_nesting) {
+ RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current),
+ "RCU nmi_nesting counter not in idle task!");
+ return !rcu_is_watching_curr_cpu();
+ }
- /* Does CPU appear to be idle from an RCU standpoint? */
- return ct_nesting() == 0;
+ RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!");
+
+ return false;
}
#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
@@ -1625,8 +1631,10 @@ static void rcu_sr_put_wait_head(struct llist_node *node)
atomic_set_release(&sr_wn->inuse, 0);
}
-/* Disabled by default. */
-static int rcu_normal_wake_from_gp;
+/* Enable rcu_normal_wake_from_gp automatically on small systems. */
+#define WAKE_FROM_GP_CPU_THRESHOLD 16
+
+static int rcu_normal_wake_from_gp = -1;
module_param(rcu_normal_wake_from_gp, int, 0644);
static struct workqueue_struct *sync_wq;
@@ -1829,6 +1837,18 @@ static noinline_for_stack bool rcu_gp_init(void)
start_new_poll = rcu_sr_normal_gp_init();
/* Record GP times before starting GP, hence rcu_seq_start(). */
old_gp_seq = rcu_state.gp_seq;
+ /*
+ * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug
+ * scan below. Otherwise we risk a race where a newly onlining CPU could
+ * be missed by the current grace period, potentially leading to
+ * use-after-free errors. For a detailed explanation of this race, see
+ * Documentation/RCU/Design/Requirements/Requirements.rst in the
+ * "Hotplug CPU" section.
+ *
+ * Also note that the root rnp's gp_seq is kept separate from, and lags,
+ * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on
+ * Single-node systems for more details (in Data-Structures.rst).
+ */
rcu_seq_start(&rcu_state.gp_seq);
/* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */
WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
@@ -1865,6 +1885,10 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
local_irq_disable();
+ /*
+ * Serialize with CPU offline. See Requirements.rst > Hotplug CPU >
+ * Concurrent Quiescent State Reporting for Offline CPUs.
+ */
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1939,7 +1963,12 @@ static noinline_for_stack bool rcu_gp_init(void)
trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
- /* Quiescent states for tasks on any now-offline CPUs. */
+ /*
+ * Quiescent states for tasks on any now-offline CPUs. Since we
+ * released the ofl and rnp lock before this loop, CPUs might
+ * have gone offline and we have to report QS on their behalf.
+ * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting.
+ */
mask = rnp->qsmask & ~rnp->qsmaskinitnext;
rnp->rcu_gp_init_mask = mask;
if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
@@ -3072,6 +3101,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
+ /* Avoid NULL dereference if callback is NULL. */
+ if (WARN_ON_ONCE(!func))
+ return;
+
if (debug_rcu_head_queue(head)) {
/*
* Probable double call_rcu(), so leak the callback.
@@ -3239,7 +3272,7 @@ static void synchronize_rcu_normal(void)
trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
- if (!READ_ONCE(rcu_normal_wake_from_gp)) {
+ if (READ_ONCE(rcu_normal_wake_from_gp) < 1) {
wait_rcu_gp(call_rcu_hurry);
goto trace_complete_out;
}
@@ -4264,7 +4297,6 @@ int rcutree_online_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return 0; /* Too early in boot for scheduler work. */
- sync_sched_exp_online_cleanup(cpu);
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -4354,6 +4386,12 @@ void rcutree_report_cpu_dead(void)
* may introduce a new READ-side while it is actually off the QS masks.
*/
lockdep_assert_irqs_disabled();
+ /*
+ * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution.
+ * The requested QS must have been reported on the last context switch
+ * from stop machine to idle.
+ */
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
// Do any dangling deferred wakeups.
do_nocb_deferred_wakeup(rdp);
@@ -4361,6 +4399,13 @@ void rcutree_report_cpu_dead(void)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
+
+ /*
+ * Hold the ofl_lock and rnp lock to avoid races between CPU going
+ * offline and doing a QS report (as below), versus rcu_gp_init().
+ * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section
+ * for more details.
+ */
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4371,6 +4416,7 @@ void rcutree_report_cpu_dead(void)
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
+ /* Clear from ->qsmaskinitnext to mark offline. */
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
arch_spin_unlock(&rcu_state.ofl_lock);
@@ -4843,6 +4889,12 @@ void __init rcu_init(void)
sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
WARN_ON(!sync_wq);
+ /* Respect if explicitly disabled via a boot parameter. */
+ if (rcu_normal_wake_from_gp < 0) {
+ if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD)
+ rcu_normal_wake_from_gp = 1;
+ }
+
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
if (qovld < 0)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 3830c19cf2f6..de6ca13a7b5f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -174,6 +174,17 @@ struct rcu_snap_record {
unsigned long jiffies; /* Track jiffies value */
};
+/*
+ * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention.
+ * to report quiescent states at the soonest possible time.
+ * The request can be in one of the following states:
+ * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
+ * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
+ * ran and we still haven't reported a quiescent state.
+ */
+#define DEFER_QS_IDLE 0
+#define DEFER_QS_PENDING 1
+
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
@@ -192,7 +203,7 @@ struct rcu_data {
/* during and after the last grace */
/* period it is aware of. */
struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
- bool defer_qs_iw_pending; /* Scheduler attention pending? */
+ int defer_qs_iw_pending; /* Scheduler attention pending? */
struct work_struct strict_work; /* Schedule readers for strict GPs. */
/* 2) batch handling */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index c36c7d5575ca..6058a734090c 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -141,6 +141,13 @@ static void __maybe_unused sync_exp_reset_tree(void)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
+ /*
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited GP
+ * until such time as the ->expmask bits are cleared.
+ */
+ if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp))
+ WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -393,13 +400,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
}
mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
- /*
- * Need to wait for any blocked tasks as well. Note that
- * additional blocking tasks will also block the expedited GP
- * until such time as the ->expmask bits are cleared.
- */
- if (rcu_preempt_has_tasks(rnp))
- WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
@@ -751,12 +751,8 @@ static void rcu_exp_handler(void *unused)
struct task_struct *t = current;
/*
- * First, is there no need for a quiescent state from this CPU,
- * or is this CPU already looking for a quiescent state for the
- * current grace period? If either is the case, just leave.
- * However, this should not happen due to the preemptible
- * sync_sched_exp_online_cleanup() implementation being a no-op,
- * so warn if this does happen.
+ * WARN if the CPU is unexpectedly already looking for a
+ * QS or has already reported one.
*/
ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
@@ -803,11 +799,6 @@ static void rcu_exp_handler(void *unused)
WARN_ON_ONCE(1);
}
-/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
-}
-
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each that is blocking the current
@@ -885,38 +876,6 @@ static void rcu_exp_handler(void *unused)
rcu_exp_need_qs();
}
-/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
- unsigned long flags;
- int my_cpu;
- struct rcu_data *rdp;
- int ret;
- struct rcu_node *rnp;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
- rnp = rdp->mynode;
- my_cpu = get_cpu();
- /* Quiescent state either not needed or already requested, leave. */
- if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- READ_ONCE(rdp->cpu_no_qs.b.exp)) {
- put_cpu();
- return;
- }
- /* Quiescent state needed on current CPU, so set it up locally. */
- if (my_cpu == cpu) {
- local_irq_save(flags);
- rcu_exp_need_qs();
- local_irq_restore(flags);
- put_cpu();
- return;
- }
- /* Quiescent state needed on some other CPU, send IPI. */
- ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
- put_cpu();
- WARN_ON_ONCE(ret);
-}
-
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections that are
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index b473ff056f49..e6cd56603cad 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -276,7 +276,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
* callback storms, no need to wake up too early.
*/
if (waketype == RCU_NOCB_WAKE_LAZY &&
- rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
@@ -1146,7 +1146,6 @@ static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
{
int wake_gp;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
WARN_ON_ONCE(cpu_online(rdp->cpu));
/*
@@ -1156,7 +1155,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
if (!rdp->nocb_gp_rdp)
return -EINVAL;
- if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+ if (WARN_ON_ONCE(!rdp->nocb_gp_kthread))
return -EINVAL;
pr_info("Offloading %d\n", rdp->cpu);
@@ -1166,7 +1165,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
+ wake_up_process(rdp->nocb_gp_kthread);
swait_event_exclusive(rdp->nocb_state_wq,
rcu_nocb_rdp_offload_wait_cond(rdp));
@@ -1564,6 +1563,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
if (rdp->nocb_gp_rdp == rdp)
show_rcu_nocb_gp_state(rdp);
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return;
+
nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp,
&rdp->nocb_entry_rdp,
typeof(*rdp),
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0b0f56f6abc8..fc14adf15cbb 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
struct rcu_node *rnp;
union rcu_special special;
+ rdp = this_cpu_ptr(&rcu_data);
+ if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
+ rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+
/*
* If RCU core is waiting for this CPU to exit its critical section,
* report the fact that it has exited. Because irqs are disabled,
* t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
- rdp = this_cpu_ptr(&rcu_data);
if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
@@ -534,7 +537,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
(!empty_norm || rnp->qsmask));
empty_exp = sync_rcu_exp_done(rnp);
- smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
t->rcu_blocked_node = NULL;
@@ -624,10 +626,98 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t)
*/
static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
{
+ unsigned long flags;
struct rcu_data *rdp;
rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
- rdp->defer_qs_iw_pending = false;
+ local_irq_save(flags);
+
+ /*
+ * If the IRQ work handler happens to run in the middle of RCU read-side
+ * critical section, it could be ineffective in getting the scheduler's
+ * attention to report a deferred quiescent state (the whole point of the
+ * IRQ work). For this reason, requeue the IRQ work.
+ *
+ * Basically, we want to avoid following situation:
+ * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
+ * 2. CPU enters new rcu_read_lock()
+ * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
+ * 4. rcu_read_unlock() does not re-queue work (state still PENDING)
+ * 5. Deferred QS reporting does not happen.
+ */
+ if (rcu_preempt_depth() > 0)
+ WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Check if expedited grace period processing during unlock is needed.
+ *
+ * This function determines whether expedited handling is required based on:
+ * 1. Task blocking an expedited grace period (based on a heuristic, could be
+ * false-positive, see below.)
+ * 2. CPU participating in an expedited grace period
+ * 3. Strict grace period mode requiring expedited handling
+ * 4. RCU priority deboosting needs when interrupts were disabled
+ *
+ * @t: The task being checked
+ * @rdp: The per-CPU RCU data
+ * @rnp: The RCU node for this CPU
+ * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock()
+ *
+ * Returns true if expedited processing of the rcu_read_unlock() is needed.
+ */
+static bool rcu_unlock_needs_exp_handling(struct task_struct *t,
+ struct rcu_data *rdp,
+ struct rcu_node *rnp,
+ bool irqs_were_disabled)
+{
+ /*
+ * Check if this task is blocking an expedited grace period. If the
+ * task was preempted within an RCU read-side critical section and is
+ * on the expedited grace period blockers list (exp_tasks), we need
+ * expedited handling to unblock the expedited GP. This is not an exact
+ * check because 't' might not be on the exp_tasks list at all - its
+ * just a fast heuristic that can be false-positive sometimes.
+ */
+ if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks))
+ return true;
+
+ /*
+ * Check if this CPU is participating in an expedited grace period.
+ * The expmask bitmap tracks which CPUs need to check in for the
+ * current expedited GP. If our CPU's bit is set, we need expedited
+ * handling to help complete the expedited GP.
+ */
+ if (rdp->grpmask & READ_ONCE(rnp->expmask))
+ return true;
+
+ /*
+ * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods
+ * are treated as short for testing purposes even if that means
+ * disturbing the system more. Check if either:
+ * - This CPU has not yet reported a quiescent state, or
+ * - This task was preempted within an RCU critical section
+ * In either case, require expedited handling for strict GP mode.
+ */
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node))
+ return true;
+
+ /*
+ * RCU priority boosting case: If a task is subject to RCU priority
+ * boosting and exits an RCU read-side critical section with interrupts
+ * disabled, we need expedited handling to ensure timely deboosting.
+ * Without this, a low-priority task could incorrectly run at high
+ * real-time priority for an extended period degrading real-time
+ * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels,
+ * not just to PREEMPT_RT.
+ */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node)
+ return true;
+
+ return false;
}
/*
@@ -649,18 +739,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
irqs_were_disabled = irqs_disabled_flags(flags);
if (preempt_bh_were_disabled || irqs_were_disabled) {
- bool expboost; // Expedited GP in flight or possible boosting.
+ bool needs_exp; // Expedited handling needed.
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
- (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
- (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
- ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
- (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
- t->rcu_blocked_node);
+ needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled);
+
// Need to defer quiescent state until everything is enabled.
- if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
+ if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) {
// Using softirq, safe to awaken, and either the
// wakeup is free or there is either an expedited
// GP in flight or a potential need to deboost.
@@ -673,17 +759,13 @@ static void rcu_read_unlock_special(struct task_struct *t)
set_tsk_need_resched(current);
set_preempt_need_resched();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
- expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
+ needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+ cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
- IS_ENABLED(CONFIG_PREEMPT_RT))
- rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
- rcu_preempt_deferred_qs_handler);
- else
- init_irq_work(&rdp->defer_qs_iw,
- rcu_preempt_deferred_qs_handler);
- rdp->defer_qs_iw_pending = true;
+ rdp->defer_qs_iw =
+ IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
+ rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 486c00536207..2a5a5329322d 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -17,8 +17,37 @@
// Controlling CPU stall warnings, including delay calculation.
/* panic() on RCU Stall sysctl. */
-int sysctl_panic_on_rcu_stall __read_mostly;
-int sysctl_max_rcu_stall_to_panic __read_mostly;
+static int sysctl_panic_on_rcu_stall __read_mostly;
+static int sysctl_max_rcu_stall_to_panic __read_mostly;
+
+static const struct ctl_table rcu_stall_sysctl_table[] = {
+ {
+ .procname = "panic_on_rcu_stall",
+ .data = &sysctl_panic_on_rcu_stall,
+ .maxlen = sizeof(sysctl_panic_on_rcu_stall),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "max_rcu_stall_to_panic",
+ .data = &sysctl_max_rcu_stall_to_panic,
+ .maxlen = sizeof(sysctl_max_rcu_stall_to_panic),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+};
+
+static int __init init_rcu_stall_sysctl(void)
+{
+ register_sysctl_init("kernel", rcu_stall_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rcu_stall_sysctl);
#ifdef CONFIG_SYSFS
@@ -953,8 +982,7 @@ void show_rcu_gp_kthreads(void)
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
- if (rcu_segcblist_is_offloaded(&rdp->cblist))
- show_rcu_nocb_state(rdp);
+ show_rcu_nocb_state(rdp);
}
pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
show_rcu_tasks_gp_kthreads();