summaryrefslogtreecommitdiff
path: root/kernel/rcu
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-08-04 15:55:08 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2014-08-04 15:55:08 -0700
commit5bda4f638f36ef4c4e3b1397b02affc3db94356e (patch)
treed1bde148cde9981c31941382b2076084c7f5796c /kernel/rcu
parenta45c657f28f82b056173d1afc2e7ed1f1f68829f (diff)
parent01c9db827146ce321562a992a5dbc1a49b1a99ce (diff)
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU changes from Ingo Molar: "The main changes: - torture-test updates - callback-offloading changes - maintainership changes - update RCU documentation - miscellaneous fixes" * 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits) rcu: Allow for NULL tick_nohz_full_mask when nohz_full= missing rcu: Fix a sparse warning in rcu_report_unblock_qs_rnp() rcu: Fix a sparse warning in rcu_initiate_boost() rcu: Fix __rcu_reclaim() to use true/false for bool rcu: Remove CONFIG_PROVE_RCU_DELAY rcu: Use __this_cpu_read() instead of per_cpu_ptr() rcu: Don't use NMIs to dump other CPUs' stacks rcu: Bind grace-period kthreads to non-NO_HZ_FULL CPUs rcu: Simplify priority boosting by putting rt_mutex in rcu_node rcu: Check both root and current rcu_node when setting up future grace period rcu: Allow post-unlock reference for rt_mutex rcu: Loosen __call_rcu()'s rcu_head alignment constraint rcu: Eliminate read-modify-write ACCESS_ONCE() calls rcu: Remove redundant ACCESS_ONCE() from tick_do_timer_cpu rcu: Make rcu node arrays static const char * const signal: Explain local_irq_save() call rcu: Handle obsolete references to TINY_PREEMPT_RCU rcu: Document deadlock-avoidance information for rcu_read_unlock() scripts: Teach get_maintainer.pl about the new "R:" tag rcu: Update rcu torture maintainership filename patterns ...
Diffstat (limited to 'kernel/rcu')
-rw-r--r--kernel/rcu/rcu.h8
-rw-r--r--kernel/rcu/srcu.c4
-rw-r--r--kernel/rcu/tree.c59
-rw-r--r--kernel/rcu/tree.h36
-rw-r--r--kernel/rcu/tree_plugin.h302
-rw-r--r--kernel/rcu/update.c3
6 files changed, 314 insertions, 98 deletions
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bfda2726ca45..ff1a6de62f17 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
void kfree(const void *);
+/*
+ * Reclaim the specified callback, either by invoking it (non-lazy case)
+ * or freeing it directly (lazy case). Return true if lazy, false otherwise.
+ */
static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
unsigned long offset = (unsigned long)head->func;
@@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
- return 1;
+ return true;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head));
head->func(head);
rcu_lock_release(&rcu_callback_map);
- return 0;
+ return false;
}
}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index c639556f3fa0..e037f3eb2f7b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
idx = ACCESS_ONCE(sp->completed) & 0x1;
preempt_disable();
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->c[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
preempt_enable();
return idx;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 625d0b0cd75a..1b70cb6fbe3c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1013,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
}
/*
- * Dump stacks of all tasks running on stalled CPUs. This is a fallback
- * for architectures that do not implement trigger_all_cpu_backtrace().
- * The NMI-triggered stack traces are more accurate because they are
- * printed by the target CPU.
+ * Dump stacks of all tasks running on stalled CPUs.
*/
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
@@ -1094,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
(long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected == 0)
pr_err("INFO: Stall ended before state dump start\n");
- else if (!trigger_all_cpu_backtrace())
+ else
rcu_dump_cpu_stacks(rsp);
/* Complain about tasks blocking the grace period. */
@@ -1125,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
jiffies - rsp->gp_start,
(long)rsp->gpnum, (long)rsp->completed, totqlen);
- if (!trigger_all_cpu_backtrace())
- dump_stack();
+ rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
@@ -1305,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* believe that a grace period is in progress, then we must wait
* for the one following, which is in "c". Because our request
* will be noticed at the end of the current grace period, we don't
- * need to explicitly start one.
+ * need to explicitly start one. We only do the lockless check
+ * of rnp_root's fields if the current rcu_node structure thinks
+ * there is no grace period in flight, and because we hold rnp->lock,
+ * the only possible change is when rnp_root's two fields are
+ * equal, in which case rnp_root->gpnum might be concurrently
+ * incremented. But that is OK, as it will just result in our
+ * doing some extra useless work.
*/
if (rnp->gpnum != rnp->completed ||
- ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
+ ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
goto out;
@@ -1645,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
-#ifdef CONFIG_PROVE_RCU_DELAY
- if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
- system_state == SYSTEM_RUNNING)
- udelay(200);
-#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
cond_resched();
}
@@ -2347,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- ACCESS_ONCE(rdp->qlen) -= count;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
@@ -2485,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp)
struct rcu_node *rnp_old = NULL;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ rnp = __this_cpu_read(rsp->rda->mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
if (ret) {
- ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ rsp->n_force_qs_lh++;
return;
}
rnp_old = rnp;
@@ -2504,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
smp_mb__after_unlock_lock();
raw_spin_unlock(&rnp_old->fqslock);
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
@@ -2662,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
unsigned long flags;
struct rcu_data *rdp;
- WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
+ WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
ACCESS_ONCE(head->func) = rcu_leak_callback;
@@ -2693,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
local_irq_restore(flags);
return;
}
- ACCESS_ONCE(rdp->qlen)++;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
if (lazy)
rdp->qlen_lazy++;
else
@@ -3257,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
* ACCESS_ONCE() to prevent the compiler from speculating
* the increment to precede the early-exit check.
*/
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3307,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Increment ->n_barrier_done to prevent duplicate work. */
smp_mb(); /* Keep increment after above mechanism. */
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3564,14 +3561,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static char *buf[] = { "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static char *fqs[] = { "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const buf[] = {
+ "rcu_node_0",
+ "rcu_node_1",
+ "rcu_node_2",
+ "rcu_node_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const fqs[] = {
+ "rcu_node_fqs_0",
+ "rcu_node_fqs_1",
+ "rcu_node_fqs_2",
+ "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
static u8 fl_mask = 0x1;
int cpustride = 1;
int i;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0f69a79c5b7d..71e64c718f75 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -172,6 +172,14 @@ struct rcu_node {
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
+ struct completion boost_completion;
+ /* Used to ensure that the rt_mutex used */
+ /* to carry out the boosting is fully */
+ /* released with no future boostee accesses */
+ /* before that rt_mutex is re-initialized. */
+ struct rt_mutex boost_mtx;
+ /* Used only for the priority-boosting */
+ /* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
struct task_struct *boost_kthread_task;
@@ -334,11 +342,29 @@ struct rcu_data {
struct rcu_head **nocb_tail;
atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
atomic_long_t nocb_q_count_lazy; /* (approximate). */
+ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
+ struct rcu_head **nocb_follower_tail;
+ atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
+ atomic_long_t nocb_follower_count_lazy; /* (approximate). */
int nocb_p_count; /* # CBs being invoked by kthread */
int nocb_p_count_lazy; /* (approximate). */
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+
+ /* The following fields are used by the leader, hence own cacheline. */
+ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
+ /* CBs waiting for GP. */
+ struct rcu_head **nocb_gp_tail;
+ long nocb_gp_count;
+ long nocb_gp_count_lazy;
+ bool nocb_leader_wake; /* Is the nocb leader thread awake? */
+ struct rcu_data *nocb_next_follower;
+ /* Next follower in wakeup chain. */
+
+ /* The following fields are used by the follower, hence new cachline. */
+ struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
+ /* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
@@ -587,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
/* Sum up queue lengths for tracing. */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
{
- *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
+ *ql = atomic_long_read(&rdp->nocb_q_count) +
+ rdp->nocb_p_count +
+ atomic_long_read(&rdp->nocb_follower_count) +
+ rdp->nocb_p_count + rdp->nocb_gp_count;
+ *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
+ rdp->nocb_p_count_lazy +
+ atomic_long_read(&rdp->nocb_follower_count_lazy) +
+ rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 02ac0fb186b8..00dc411e9676 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -33,6 +33,7 @@
#define RCU_KTHREAD_PRIO 1
#ifdef CONFIG_RCU_BOOST
+#include "../locking/rtmutex_common.h"
#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
#else
#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
@@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t)
unsigned long flags;
struct list_head *np;
#ifdef CONFIG_RCU_BOOST
- struct rt_mutex *rbmp = NULL;
+ bool drop_boost_mutex = false;
#endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
int special;
@@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
- /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
- if (t->rcu_boost_mutex) {
- rbmp = t->rcu_boost_mutex;
- t->rcu_boost_mutex = NULL;
- }
+ /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (rbmp)
- rt_mutex_unlock(rbmp);
+ if (drop_boost_mutex) {
+ rt_mutex_unlock(&rnp->boost_mtx);
+ complete(&rnp->boost_completion);
+ }
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
/* Because preemptible RCU does not exist, no quieting of tasks. */
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status)
static int rcu_boost(struct rcu_node *rnp)
{
unsigned long flags;
- struct rt_mutex mtx;
struct task_struct *t;
struct list_head *tb;
@@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp)
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&mtx, t);
- t->rcu_boost_mutex = &mtx;
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+ init_completion(&rnp->boost_completion);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
- rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
+ /* Lock only for side effect: boosts task t's priority. */
+ rt_mutex_lock(&rnp->boost_mtx);
+ rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+
+ /* Wait for boostee to be done w/boost_mtx before reinitializing. */
+ wait_for_completion(&rnp->boost_completion);
return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
ACCESS_ONCE(rnp->boost_tasks) != NULL;
@@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg)
* about it going away.
*/
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
struct task_struct *t;
@@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu)
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -2060,6 +2066,22 @@ bool rcu_is_nocb_cpu(int cpu)
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
+ * Kick the leader kthread for this NOCB group.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ struct rcu_data *rdp_leader = rdp->nocb_leader;
+
+ if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+ return;
+ if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
+ /* Prior xchg orders against prior callback enqueue. */
+ ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
+ wake_up(&rdp_leader->nocb_wq);
+ }
+}
+
+/*
* Enqueue the specified string of rcu_head structures onto the specified
* CPU's no-CBs lists. The CPU is specified by rdp, the head of the
* string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
len = atomic_long_read(&rdp->nocb_q_count);
if (old_rhpp == &rdp->nocb_head) {
if (!irqs_disabled_flags(flags)) {
- wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+ /* ... if queue was empty ... */
+ wake_nocb_leader(rdp, false);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
@@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
- wake_up_process(t); /* ... or if many callbacks queued. */
+ /* ... or if many callbacks queued. */
+ wake_nocb_leader(rdp, true);
rdp->qlen_last_fqs_check = LONG_MAX / 2;
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
} else {
@@ -2213,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
}
/*
+ * Leaders come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_leader_wait(struct rcu_data *my_rdp)
+{
+ bool firsttime = true;
+ bool gotcbs;
+ struct rcu_data *rdp;
+ struct rcu_head **tail;
+
+wait_again:
+
+ /* Wait for callbacks to appear. */
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+ wait_event_interruptible(my_rdp->nocb_wq,
+ ACCESS_ONCE(my_rdp->nocb_leader_wake));
+ /* Memory barrier handled by smp_mb() calls below and repoll. */
+ } else if (firsttime) {
+ firsttime = false; /* Don't drown trace log with "Poll"! */
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+ }
+
+ /*
+ * Each pass through the following loop checks a follower for CBs.
+ * We are our own first follower. Any CBs found are moved to
+ * nocb_gp_head, where they await a grace period.
+ */
+ gotcbs = false;
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs here, try next follower. */
+
+ /* Move callbacks to wait-for-GP list, which is empty. */
+ ACCESS_ONCE(rdp->nocb_head) = NULL;
+ rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
+ rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
+ rdp->nocb_gp_count_lazy =
+ atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
+ gotcbs = true;
+ }
+
+ /*
+ * If there were no callbacks, sleep a bit, rescan after a
+ * memory barrier, and go retry.
+ */
+ if (unlikely(!gotcbs)) {
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ "WokeEmpty");
+ flush_signals(current);
+ schedule_timeout_interruptible(1);
+
+ /* Rescan in case we were a victim of memory ordering. */
+ my_rdp->nocb_leader_wake = false;
+ smp_mb(); /* Ensure _wake false before scan. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
+ if (ACCESS_ONCE(rdp->nocb_head)) {
+ /* Found CB, so short-circuit next wait. */
+ my_rdp->nocb_leader_wake = true;
+ break;
+ }
+ goto wait_again;
+ }
+
+ /* Wait for one grace period. */
+ rcu_nocb_wait_gp(my_rdp);
+
+ /*
+ * We left ->nocb_leader_wake set to reduce cache thrashing.
+ * We clear it now, but recheck for new callbacks while
+ * traversing our follower list.
+ */
+ my_rdp->nocb_leader_wake = false;
+ smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
+
+ /* Each pass through the following loop wakes a follower, if needed. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ if (ACCESS_ONCE(rdp->nocb_head))
+ my_rdp->nocb_leader_wake = true; /* No need to wait. */
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs, so no need to wake follower. */
+
+ /* Append callbacks to follower's "done" list. */
+ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ *tail = rdp->nocb_gp_head;
+ atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
+ atomic_long_add(rdp->nocb_gp_count_lazy,
+ &rdp->nocb_follower_count_lazy);
+ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
+ /*
+ * List was empty, wake up the follower.
+ * Memory barriers supplied by atomic_long_add().
+ */
+ wake_up(&rdp->nocb_wq);
+ }
+ }
+
+ /* If we (the leader) don't have CBs, go wait some more. */
+ if (!my_rdp->nocb_follower_head)
+ goto wait_again;
+}
+
+/*
+ * Followers come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_follower_wait(struct rcu_data *rdp)
+{
+ bool firsttime = true;
+
+ for (;;) {
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "FollowerSleep");
+ wait_event_interruptible(rdp->nocb_wq,
+ ACCESS_ONCE(rdp->nocb_follower_head));
+ } else if (firsttime) {
+ /* Don't drown trace log with "Poll"! */
+ firsttime = false;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
+ }
+ if (smp_load_acquire(&rdp->nocb_follower_head)) {
+ /* ^^^ Ensure CB invocation follows _head test. */
+ return;
+ }
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "WokeEmpty");
+ flush_signals(current);
+ schedule_timeout_interruptible(1);
+ }
+}
+
+/*
* Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
- * callbacks queued by the corresponding no-CBs CPU.
+ * callbacks queued by the corresponding no-CBs CPU, however, there is
+ * an optional leader-follower relationship so that the grace-period
+ * kthreads don't have to do quite so many wakeups.
*/
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
- bool firsttime = 1;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg)
/* Each pass through this loop invokes one batch of callbacks */
for (;;) {
- /* If not polling, wait for next batch of callbacks. */
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Sleep"));
- wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
- /* Memory barrier provide by xchg() below. */
- } else if (firsttime) {
- firsttime = 0;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Poll"));
- }
- list = ACCESS_ONCE(rdp->nocb_head);
- if (!list) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeEmpty"));
- schedule_timeout_interruptible(1);
- flush_signals(current);
- continue;
- }
- firsttime = 1;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeNonEmpty"));
-
- /*
- * Extract queued callbacks, update counts, and wait
- * for a grace period to elapse.
- */
- ACCESS_ONCE(rdp->nocb_head) = NULL;
- tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- c = atomic_long_xchg(&rdp->nocb_q_count, 0);
- cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
- ACCESS_ONCE(rdp->nocb_p_count) += c;
- ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
- rcu_nocb_wait_gp(rdp);
+ /* Wait for callbacks. */
+ if (rdp->nocb_leader == rdp)
+ nocb_leader_wait(rdp);
+ else
+ nocb_follower_wait(rdp);
+
+ /* Pull the ready-to-invoke callbacks onto local list. */
+ list = ACCESS_ONCE(rdp->nocb_follower_head);
+ BUG_ON(!list);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
+ ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+ tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+ c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
+ cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
+ rdp->nocb_p_count += c;
+ rdp->nocb_p_count_lazy += cl;
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
- wake_up(&rdp->nocb_wq);
+ wake_nocb_leader(rdp, false);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
}
@@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
init_waitqueue_head(&rdp->nocb_wq);
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
-/* Create a kthread for each RCU flavor for each no-CBs CPU. */
+/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_leader_stride = -1;
+module_param(rcu_nocb_leader_stride, int, 0444);
+
+/*
+ * Create a kthread for each RCU flavor for each no-CBs CPU.
+ * Also initialize leader-follower relationships.
+ */
static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
{
int cpu;
+ int ls = rcu_nocb_leader_stride;
+ int nl = 0; /* Next leader. */
struct rcu_data *rdp;
+ struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_prev = NULL;
struct task_struct *t;
if (rcu_nocb_mask == NULL)
return;
+#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
+ if (tick_nohz_full_running)
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
+ if (ls == -1) {
+ ls = int_sqrt(nr_cpu_ids);
+ rcu_nocb_leader_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure and
+ * spawns one rcu_nocb_kthread().
+ */
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (rdp->cpu >= nl) {
+ /* New leader, set up for followers & next leader. */
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp->nocb_leader = rdp;
+ rdp_leader = rdp;
+ } else {
+ /* Another follower, link to previous leader. */
+ rdp->nocb_leader = rdp_leader;
+ rdp_prev->nocb_next_follower = rdp;
+ }
+ rdp_prev = rdp;
+
+ /* Spawn the kthread for this CPU. */
t = kthread_run(rcu_nocb_kthread, rdp,
"rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
@@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
*/
static void rcu_bind_gp_kthread(void)
{
-#ifdef CONFIG_NO_HZ_FULL
- int cpu = ACCESS_ONCE(tick_do_timer_cpu);
+ int __maybe_unused cpu;
- if (cpu < 0 || cpu >= nr_cpu_ids)
+ if (!tick_nohz_full_enabled())
return;
- if (raw_smp_processor_id() != cpu)
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ cpu = tick_do_timer_cpu;
+ if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#endif /* #ifdef CONFIG_NO_HZ_FULL */
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+ if (!is_housekeeping_cpu(raw_smp_processor_id()))
+ housekeeping_affine(current);
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index bc7883570530..4056d7992a6c 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -90,9 +90,6 @@ void __rcu_read_unlock(void)
} else {
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
-#ifdef CONFIG_PROVE_RCU_DELAY
- udelay(10); /* Make preemption more probable. */
-#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
barrier(); /* assign before ->rcu_read_unlock_special load */
if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
rcu_read_unlock_special(t);