From 2a67e741bbbc022e0fadf8c6dbc3a76019ecd0cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Oct 2015 12:24:23 +0200 Subject: rcu: Create transitive rnp->lock acquisition functions Providing RCU's memory-ordering guarantees requires that the rcu_node tree's locking provide transitive memory ordering, which the Linux kernel's spinlocks currently do not provide unless smp_mb__after_unlock_lock() is used. Having a separate smp_mb__after_unlock_lock() after each and every lock acquisition is error-prone, hard to read, and a bit annoying, so this commit provides wrapper functions that pull in the smp_mb__after_unlock_lock() invocations. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 78 ++++++++++++++++-------------------------------- kernel/rcu/tree.h | 39 ++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 18 ++++------- 3 files changed, 71 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54fe5..daf17e248757 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1534,10 +1534,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) { - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); - } + if (rnp != rnp_root) + raw_spin_lock_rcu_node(rnp_root); /* * Get a new grace-period number. If there really is no grace @@ -1786,11 +1784,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && rdp->completed == READ_ONCE(rnp->completed) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - smp_mb__after_unlock_lock(); needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) @@ -1814,8 +1811,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); @@ -1847,8 +1843,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ @@ -1904,8 +1899,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1973,8 +1967,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) } /* Clear flag to prevent immediate re-entry. 
*/ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); @@ -1993,8 +1986,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -2019,8 +2011,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * grace period is recorded in any of the rcu_node structures. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); @@ -2035,8 +2026,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ @@ -2284,8 +2274,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); oldmask = rnp_c->qsmask; } @@ -2332,8 +2321,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2355,8 +2343,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || @@ -2582,8 +2569,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (!rnp) break; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); /* GP memory ordering. */ + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { @@ -2611,8 +2597,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. 
*/ rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2809,8 +2794,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_for_each_leaf_node(rsp, rnp) { cond_resched_rcu_qs(); mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2881,8 +2865,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. */ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -3005,8 +2988,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (!rcu_gp_in_progress(rsp)) { struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); if (needwake) @@ -3426,8 +3408,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * CPUs for the current rcu_node structure up the rcu_node tree. */ rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; /* No new CPUs, nothing to do. */ @@ -3447,8 +3428,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rnp_up = rnp->parent; done = false; while (rnp_up) { - raw_spin_lock_irqsave(&rnp_up->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; @@ -3472,8 +3452,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) sync_exp_reset_tree_hotplug(rsp); rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -3531,8 +3510,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } @@ -3549,8 +3527,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); __rcu_report_exp_rnp(rsp, rnp, wake, flags); } @@ -3564,8 +3541,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -3708,8 +3684,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { - 
raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; @@ -4198,8 +4173,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; mask = rdp->grpmask; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinitnext |= mask; rnp->expmaskinitnext |= mask; if (!rdp->beenonline) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9fb4e238d4dc..f32bebb6bc90 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -664,3 +664,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #else /* #ifdef CONFIG_PPC */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_PPC */ + +/* + * Wrappers for the rcu_node::lock acquire. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + */ +static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&(rnp)->lock, flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) +{ + bool locked = raw_spin_trylock(&rnp->lock); + + if (locked) + smp_mb__after_unlock_lock(); + return locked; +} diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772630..fa0e3b96a9ed 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -301,8 +301,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -457,8 +456,7 @@ void rcu_read_unlock_special(struct task_struct *t) */ for (;;) { rnp = t->rcu_blocked_node; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ if (rnp == t->rcu_blocked_node) break; WARN_ON_ONCE(1); @@ -989,8 +987,7 @@ static int rcu_boost(struct rcu_node *rnp) READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. 
*/ - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* * Recheck under the lock: all tasks in need of boosting @@ -1176,8 +1173,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = kthread_prio; @@ -1567,8 +1563,7 @@ static void rcu_prepare_for_idle(void) if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ if (needwake) @@ -2068,8 +2063,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) -- cgit From 6cf10081220ae21175a867d446b3167bcbcb937b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 Oct 2015 15:36:54 -0700 Subject: rcu: Add transitivity to remaining rcu_node ->lock acquisitions The rule is that all acquisitions of the rcu_node structure's ->lock must provide transitivity: The lock is not acquired that frequently, and sorting out exactly which required it and which did not would be a maintenance nightmare. This commit therefore supplies the needed transitivity to the remaining ->lock acquisitions. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 24 ++++++++++++------------ kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_trace.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index daf17e248757..81aa1cdc6bc9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1214,7 +1214,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) @@ -1237,7 +1237,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. 
*/ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1256,7 +1256,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -1327,7 +1327,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -2897,7 +2897,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* Does this CPU require a not-yet-started grace period? */ local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); if (needwake) @@ -3718,7 +3718,7 @@ retry_ipi: mask_ofl_ipi &= ~mask; } else { /* Failed, raced with offline. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (cpu_online(cpu) && (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, @@ -3727,8 +3727,8 @@ retry_ipi: if (cpu_online(cpu) && (rnp->expmask & mask)) goto retry_ipi; - raw_spin_lock_irqsave(&rnp->lock, - flags); + raw_spin_lock_irqsave_rcu_node(rnp, + flags); } if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; @@ -4110,7 +4110,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (rnp == NULL) return; - raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ } @@ -4127,7 +4127,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); @@ -4154,7 +4154,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. 
*/ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -4301,7 +4301,7 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rsp->gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa0e3b96a9ed..57ba873d2f18 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -525,7 +525,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ef7093cc9b5c..8efaba870d96 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -319,7 +319,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) unsigned long gpmax; struct rcu_node *rnp = &rsp->node[0]; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); completed = READ_ONCE(rsp->completed); gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) -- cgit From 06f60de19d3141f07d954c9275fe7ccca8e96b42 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 08:15:52 -0700 Subject: rcu: Short-circuit synchronize_sched_expedited() if only one CPU If there is only one CPU, then invoking synchronize_sched_expedited() is by definition a grace period. This commit checks for this condition and does a short-circuit return in that case. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cdc6bc9..bd2605c144cc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3822,6 +3822,10 @@ void synchronize_sched_expedited(void) struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); -- cgit From 1de6e56ddc043437d335ee0455a1b34b73510c91 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 09:45:00 -0700 Subject: rcu: Clarify role of ->expmaskinitnext Analogy with the ->qsmaskinitnext field might lead one to believe that ->expmaskinitnext tracks online CPUs. This belief is incorrect: Any CPU that has ever been online will have its bit set in the ->expmaskinitnext field. This commit therefore adds a comment to make this clear, at least to people who read comments. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f32bebb6bc90..8151971a8978 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -178,6 +178,8 @@ struct rcu_node { /* beginning of each expedited GP. */ unsigned long expmaskinitnext; /* Online CPUs for next expedited GP. */ + /* Any CPU that has ever been online will */ + /* have its bit set. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. 
*/ int grplo; /* lowest-numbered CPU or group here. */ -- cgit From 886ef5a18a4a771d5fdc0e23ae9373bb35d529e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 12:34:40 -0700 Subject: rcu: Move smp_mb() from rcu_seq_snap() to rcu_exp_gp_seq_snap() The memory barrier in rcu_seq_snap() is needed only for grace periods, so this commit moves it to the grace-period-oriented wrapper rcu_exp_gp_seq_snap(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bd2605c144cc..a4a0475aede9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3347,7 +3347,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ s = (READ_ONCE(*sp) + 3) & ~0x1; smp_mb(); /* Above access must not bleed into critical section. */ return s; @@ -3374,6 +3373,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + smp_mb(); /* Caller's modifications seen first by other CPUs. */ return rcu_seq_snap(&rsp->expedited_sequence); } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -- cgit From 1307f2148719cc9e9d12f5fa7d5b3b61ec5aef72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 15:29:21 -0700 Subject: rcu: Invert sync_rcu_exp_select_cpus() "if" statement This commit saves a couple lines of code and reduces indentation by inverting the sense of an "if" statement in the function sync_rcu_exp_select_cpus(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a4a0475aede9..00f07d6436ce 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3716,24 +3716,22 @@ retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; - } else { - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, - flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, - flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; -- cgit From df5bd5144a80a9f6c3807383b11f735dae9caf9d Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 1 Oct 2015 10:26:24 -0700 Subject: rcu: Reduce expedited GP memory contention via per-CPU variables Currently, the piggybacked-work checks carried out by sync_exp_work_done() atomically increment a small set of variables (the ->expedited_workdone0, ->expedited_workdone1, ->expedited_workdone2, ->expedited_workdone3 fields in the rcu_state structure), which will form a memory-contention bottleneck given a sufficiently large number of CPUs concurrently invoking either synchronize_rcu_expedited() or synchronize_sched_expedited(). This commit therefore moves these for fields to the per-CPU rcu_data structure, eliminating the memory contention. The show_rcuexp() function also changes to sum up each field in the rcu_data structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++------ kernel/rcu/tree.h | 8 ++++---- kernel/rcu/tree_trace.c | 18 ++++++++++++------ 3 files changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 00f07d6436ce..33d7e2551165 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3585,7 +3585,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { - struct rcu_data *rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3599,7 +3599,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { if (sync_exp_work_done(rsp, rnp0, NULL, - &rsp->expedited_workdone0, s)) + &rdp->expedited_workdone0, s)) return NULL; return rnp0; } @@ -3613,14 +3613,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. */ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone2, s)) + &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) @@ -3630,7 +3629,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone3, s)) + &rdp->expedited_workdone3, s)) return NULL; return rnp1; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8151971a8978..6cbec31f99d6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -386,6 +386,10 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + atomic_long_t expedited_workdone0; /* # done by others #0. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -500,10 +504,6 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_workdone0; /* # done by others #0. 
*/ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8efaba870d96..d43649450ea4 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -183,14 +183,20 @@ static const struct file_operations rcudata_fops = { static int show_rcuexp(struct seq_file *m, void *v) { + int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; - + struct rcu_data *rdp; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->expedited_workdone0); + s1 += atomic_long_read(&rdp->expedited_workdone1); + s2 += atomic_long_read(&rdp->expedited_workdone2); + s3 += atomic_long_read(&rdp->expedited_workdone3); + } seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_workdone0), - atomic_long_read(&rsp->expedited_workdone1), - atomic_long_read(&rsp->expedited_workdone2), - atomic_long_read(&rsp->expedited_workdone3), + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); -- cgit From 73f36f9de8bed78bcda2704a348594c20518b455 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 10:56:55 -0800 Subject: rcu: Make expedited grace periods resolve stall-warning ties Currently, if a grace period ends just as the stall-warning timeout fires, an empty stall warning will be printed. This is not helpful, so this commit avoids these useless warnings by rechecking completion after awakening in synchronize_sched_expedited_wait(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 33d7e2551165..bc6b79716a86 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3757,7 +3757,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); - if (ret > 0) + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ -- cgit From 72611ab9f5d2d384a04e72d560c9c82463115cbf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 13:25:21 -0800 Subject: rcu: Add more diagnostics to expedited stall warning messages. This commit adds print statements that check the rcu_node structure to find which ->expmask bits and which ->exp_tasks structures are blocking the current expedited grace period. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bc6b79716a86..6a652d1f3d7f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3745,6 +3745,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; + int ndetected; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; @@ -3767,14 +3768,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); + ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - (void)rcu_print_task_exp_stall(rnp); + ndetected = rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; if (!(rnp->expmask & mask)) continue; + ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[cpu_online(cpu)], @@ -3783,8 +3786,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } mask <<= 1; } - pr_cont(" } %lu jiffies s: %lu\n", - jiffies - jiffies_start, rsp->expedited_sequence); + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (!ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } rcu_for_each_leaf_node(rsp, rnp) { mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { -- cgit From 5a9be7c628c5273f84abacebf7faf2488376e0f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Nov 2015 15:44:06 -0800 Subject: rcu: Add rcu_normal kernel parameter to suppress expediting Although expedited grace periods can be quite useful, and although their OS jitter has been greatly reduced, they can still pose problems for extreme real-time workloads. This commit therefore adds a rcu_normal kernel boot parameter (which can also be manipulated via sysfs) to suppress expedited grace periods, that is, to treat requests for expedited grace periods as if they were requests for normal grace periods. If both rcu_expedited and rcu_normal are specified, rcu_normal wins. This means that if you are relying on expedited grace periods to speed up boot, you will want to specify rcu_expedited on the kernel command line, and then specify rcu_normal via sysfs once boot completes. Signed-off-by: Paul E. 
McKenney --- kernel/ksysfs.c | 22 ++++++++++++++++++++-- kernel/rcu/srcu.c | 2 +- kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree_plugin.h | 6 ++++++ kernel/rcu/update.c | 12 ++++++++++++ 5 files changed, 45 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e83b26464061..b4e2fa52d8bc 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -20,7 +20,7 @@ #include #include -#include /* rcu_expedited */ +#include /* rcu_expedited and rcu_normal */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -148,7 +148,7 @@ int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", rcu_expedited); + return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -161,6 +161,23 @@ static ssize_t rcu_expedited_store(struct kobject *kobj, } KERNEL_ATTR_RW(rcu_expedited); +int rcu_normal; +static ssize_t rcu_normal_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); +} +static ssize_t rcu_normal_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_normal)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_normal); + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -203,6 +220,7 @@ static struct attribute * kernel_attrs[] = { &vmcoreinfo_attr.attr, #endif &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, NULL }; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index a63a1ea5a41b..9b9cdd549caa 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_gp_is_expedited() + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6a652d1f3d7f..489992997c06 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3841,6 +3841,12 @@ void synchronize_sched_expedited(void) if (rcu_blocking_is_gp()) return; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 57ba873d2f18..d45df3781551 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -746,6 +746,12 @@ void synchronize_rcu_expedited(void) struct rcu_state *rsp = rcu_state_p; unsigned long s; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + s = rcu_exp_gp_seq_snap(rsp); rnp_unlock = exp_funnel_lock(rsp, s); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a40f0..8fccda3a794d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -61,6 +61,7 @@ MODULE_ALIAS("rcupdate"); #define MODULE_PARAM_PREFIX "rcupdate." 
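/*
 * [ editor: aside, not part of the patch text -- a minimal sketch of how
 *   the two knobs read by this series are meant to combine, per the
 *   commit log above ("if both rcu_expedited and rcu_normal are
 *   specified, rcu_normal wins").  The helper name
 *   sketch_use_expedited_gp() is hypothetical; rcu_normal and
 *   rcu_expedited are the real module parameters declared just below. ]
 */
static inline bool sketch_use_expedited_gp(void)
{
	if (READ_ONCE(rcu_normal))
		return false;	/* rcu_normal wins over rcu_expedited. */
	return READ_ONCE(rcu_expedited) != 0;
}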
module_param(rcu_expedited, int, 0); +module_param(rcu_normal, int, 0); #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -113,6 +114,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); #ifndef CONFIG_TINY_RCU +/* + * Should expedited grace-period primitives always fall back to their + * non-expedited counterparts? Intended for use within RCU. Note + * that if the user specifies both rcu_expedited and rcu_normal, then + * rcu_normal wins. + */ +bool rcu_gp_is_normal(void) +{ + return READ_ONCE(rcu_normal); +} + static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); -- cgit From 3e42ec1aa716f10c68294b8492ae3ea684528699 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Nov 2015 18:56:00 -0800 Subject: rcu: Allow expedited grace periods to be disabled at init Expedited grace periods can speed up boot, but are undesirable in aggressive real-time systems. This commit therefore introduces a kernel parameter rcupdate.rcu_normal_after_boot that disables expedited grace periods just before init is spawned. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 8fccda3a794d..12b91f5a60a6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -63,6 +63,9 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); +static int rcu_normal_after_boot; +module_param(rcu_normal_after_boot, int, 0); + #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? @@ -178,6 +181,8 @@ void rcu_end_inkernel_boot(void) { if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) rcu_unexpedite_gp(); + if (rcu_normal_after_boot) + WRITE_ONCE(rcu_normal, 1); } #ifdef CONFIG_PREEMPT_RCU -- cgit From 3dc5dbe9a1b815b659a6b04540fc6fd4b4e3831b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 26 Sep 2015 14:51:24 -0700 Subject: rcu: Move lock_class_key to local scope Currently, the rcu_node_class[], rcu_fqs_class[], and rcu_exp_class[] arrays needlessly pollute the global namespace within tree.c. This commit therefore converts them to static local variables within rcu_init_one(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cdc6bc9..23df2661c899 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree"); /* Data structures. */ -static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. @@ -4365,6 +4361,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. 
*/ -- cgit From 47dbc90663f697a4515a8dd5c99ae43dba108cb4 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 27 Sep 2015 19:14:57 -0400 Subject: kernel: Make rcu/tree_trace.c explicitly non-modular The Kconfig currently controlling compilation of this code is: init/Kconfig:config TREE_RCU_TRACE init/Kconfig: def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU ) ...meaning that it currently is not being built as a module by anyone. Let's remove the modular code that is essentially orphaned, so that when reading the file there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We could consider moving this to an earlier initcall if desired. We don't replace module.h with init.h since the file already has that. We also delete the moduleparam.h include that is left over from commit 64db4cfff99c04cd5f550357edcc8780f96b54a2 (""Tree RCU": scalable classic RCU implementation") since it is not needed here either. We morph some tags like MODULE_AUTHOR into the comments at the top of the file for documentation purposes. Cc: "Paul E. McKenney" Cc: Josh Triplett Reviewed-by: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_trace.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8efaba870d96..82aca98b18f8 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update tracing for classic implementation + * Read-Copy Update tracing for hierarchical implementation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 + * Author: Paul E. McKenney * * Papers: http://www.rdrop.com/users/paulmck/RCU * @@ -33,9 +34,7 @@ #include #include #include -#include <linux/module.h> #include -#include <linux/moduleparam.h> #include #include #include @@ -487,16 +486,4 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutree_trace_init); -- cgit From fecbf6f01fbd83e6419ccb7f61d9a6eb987f1d92 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Sep 2015 18:19:24 -0700 Subject: rcu: Simplify rcu_sched_qs() control flow This commit applies an early-exit approach to rcu_sched_qs(), reducing the nesting level and saving a line of code. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 23df2661c899..ed3bc0578cc5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -244,22 +244,21 @@ void rcu_sched_qs(void) { unsigned long flags; - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + local_irq_save(flags); + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } + local_irq_restore(flags); } void rcu_bh_qs(void) -- cgit From 8ba9153b2c3ab733d64e22adb57820ccb6afc496 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 07:55:41 -0700 Subject: rcu: Remove lock-acquisition loop from rcu_read_unlock_special() Several releases have come and gone without the warning triggering, so remove the lock-acquisition loop. Retain the WARN_ON_ONCE() out of sheer paranoia. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 57ba873d2f18..ae4ce2b665f8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -449,19 +449,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * Remove this task from the list it blocked on. The task - * now remains queued on the rcu_node corresponding to - * the CPU it first blocked on, so the first attempt to - * acquire the task's rcu_node's ->lock will succeed. - * Keep the loop and add a WARN_ON() out of sheer paranoia. + * now remains queued on the rcu_node corresponding to the + * CPU it first blocked on, so there is no longer any need + * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. */ - for (;;) { - rnp = t->rcu_blocked_node; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - if (rnp == t->rcu_blocked_node) - break; - WARN_ON_ONCE(1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } + rnp = t->rcu_blocked_node; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + WARN_ON_ONCE(rnp != t->rcu_blocked_node); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ -- cgit From 699d40352059e64a4d993af170272585c41988d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 08:47:49 -0700 Subject: rcu: Fix obsolete rcu_bootup_announce_oddness() comment This function no longer has #ifdefs, so this commit removes the header comment calling them out. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ae4ce2b665f8..42df93721e6f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ /* * Check the RCU kernel configuration parameters and print informative - * messages about anything out of the ordinary. If you like #ifdef, you - * will love this function. + * messages about anything out of the ordinary. */ static void __init rcu_bootup_announce_oddness(void) { -- cgit From f0f2e7d307fff226e0c1df5a07101a1216a46d8a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 29 Sep 2015 08:59:32 -0700 Subject: rcu: Avoid tick_nohz_active checks on NOCBs CPUs Currently, rcu_prepare_for_idle() checks for tick_nohz_active, even on individual NOCBs CPUs, unless all CPUs are marked as NOCBs CPUs at build time. This check is pointless on NOCBs CPUs because they never have any callbacks posted, given that all of their callbacks are handed off to the corresponding rcuo kthread. There is a check for individually designated NOCBs CPUs, but it pointlessly follows the check for tick_nohz_active. This commit therefore moves the check for individually designated NOCBs CPUs up with the check for CONFIG_RCU_NOCB_CPU_ALL. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 42df93721e6f..8e9d4a4d0326 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1513,7 +1513,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1527,10 +1528,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - /* * If a non-lazy callback arrived at a CPU having only lazy * callbacks, invoke RCU core for the side-effect of recalculating -- cgit From 46a5d164db53ba6066b11889abb7fa6bddbe5cf7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 7 Oct 2015 09:10:48 -0700 Subject: rcu: Stop disabling interrupts in scheduler fastpaths We need the scheduler's fastpaths to be, well, fast, and unnecessarily disabling and re-enabling interrupts is not consistent with this goal, especially given that there are regions of the scheduler that already have interrupts disabled. This commit therefore moves the call to rcu_note_context_switch() to one of the interrupts-disabled regions of the scheduler, and removes the now-redundant disabling and re-enabling of interrupts from rcu_note_context_switch() and the functions it calls. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney [ paulmck: Shift rcu_note_context_switch() to avoid deadlock, as suggested by Peter Zijlstra. 
] --- kernel/rcu/tree.c | 27 ++++++++++++--------------- kernel/rcu/tree_plugin.h | 14 ++++++-------- kernel/sched/core.c | 6 ++++-- 3 files changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ed3bc0578cc5..93941d3434ad 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -242,8 +242,6 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), @@ -252,13 +250,9 @@ void rcu_sched_qs(void) __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - } - local_irq_restore(flags); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -295,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -335,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. + * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -371,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8e9d4a4d0326..e6da888cc908 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -146,8 +146,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. */ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? 
RCU_EXP_TASKS : 0) + @@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -250,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -285,12 +284,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. */ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -300,7 +298,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -316,7 +314,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9319e..ec72de234feb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3085,7 +3085,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3104,13 +3103,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ -- cgit From a0e3a3aa2841d5720a277de53b6882eb8b2ef698 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 Dec 2015 17:34:10 -0800 Subject: rcutorture: Flag nonexistent RCU GP kthread Currently, if the RCU grace-period kthread has not yet been created, the starvation-check code will print zero for the state, which maps to TASK_RUNNING. This could clearly be quite confusing, so this commit prints ~0, which does not map to any legal ->state value. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cdc6bc9..e2315fb57b23 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1201,7 +1201,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : 0); + rsp->gp_kthread ? 
From a0e3a3aa2841d5720a277de53b6882eb8b2ef698 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 Dec 2015 17:34:10 -0800 Subject: rcutorture: Flag nonexistent RCU GP kthread Currently, if the RCU grace-period kthread has not yet been created, the starvation-check code will print zero for the state, which maps to TASK_RUNNING. This could clearly be quite confusing, so this commit prints ~0, which does not map to any legal ->state value. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 81aa1cdc6bc9..e2315fb57b23 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1201,7 +1201,7 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : 0); + rsp->gp_kthread ? rsp->gp_kthread->state : ~0); } /* -- cgit From b1adb3e2736b695821badc715d2c7a5d873b8b94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Oct 2015 10:38:16 -0700 Subject: rcutorture: Dump stack when GP kthread stalls This commit increases the debug information for the case where the grace-period kthread is being prevented from running: it now dumps that kthread's stack. Signed-off-by: Paul E. McKenney [ paulmck: Split into prior commit and this commit, as suggested by Josh Triplett. ] Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e2315fb57b23..7b78c88e19a3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1196,12 +1196,15 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) + if (j - gpa > 2 * HZ) { pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + if (rsp->gp_kthread) + sched_show_task(rsp->gp_kthread); + } } /* -- cgit From 18aff33e7314253b9437234bd6d69ddc4827de70 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 13:35:28 -0800 Subject: rcutorture: Print symbolic name for rcu_torture_writer_state Currently, rcu_torture_writer_state is printed as an integer, which slows debugging. This commit therefore prints a symbolic name in addition to the integer. Signed-off-by: Paul E. McKenney [ paulmck: More "const", as suggested by Josh Triplett. ] Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d89328e260df..d2988d047d66 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -162,6 +162,27 @@ static int rcu_torture_writer_state; #define RTWS_SYNC 7 #define RTWS_STUTTER 8 #define RTWS_STOPPING 9 +static const char * const rcu_torture_writer_state_names[] = { + "RTWS_FIXED_DELAY", + "RTWS_DELAY", + "RTWS_REPLACE", + "RTWS_DEF_FREE", + "RTWS_EXP_SYNC", + "RTWS_COND_GET", + "RTWS_COND_SYNC", + "RTWS_SYNC", + "RTWS_STUTTER", + "RTWS_STOPPING", +}; + +static const char *rcu_torture_writer_state_getname(void) +{ + unsigned int i = READ_ONCE(rcu_torture_writer_state); + + if (i >= ARRAY_SIZE(rcu_torture_writer_state_names)) + return "???"; + return rcu_torture_writer_state_names[i]; +} #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); -- cgit
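The rcu_torture_writer_state_getname() helper just added follows a reusable pattern: snapshot the state once, bounds-check the snapshot as an unsigned value, and only then index a const name table. A hedged userspace rendering of the same idiom is below; the state values and names are invented for illustration.

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static volatile int writer_state;	/* racily updated elsewhere */

static const char * const state_names[] = {
	"STATE_IDLE",
	"STATE_REPLACE",
	"STATE_SYNC",
};

/*
 * Snapshot once into an unsigned variable so that a concurrent update,
 * or a bogus negative value, cannot index outside the table; this is
 * the role READ_ONCE() and the unsigned comparison play above.
 */
static const char *state_getname(void)
{
	unsigned int i = (unsigned int)writer_state;

	if (i >= ARRAY_SIZE(state_names))
		return "???";
	return state_names[i];
}

int main(void)
{
	writer_state = 2;
	printf("state %s(%d)\n", state_getname(), writer_state);
	writer_state = -1;	/* out of range, prints "???" */
	printf("state %s(%d)\n", state_getname(), writer_state);
	return 0;
}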
From 6b50e119c440b7532ed749b635a58b3839f62992 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2015 14:39:26 -0800 Subject: rcutorture: Print symbolic name for ->gp_state Currently, ->gp_state is printed as an integer, which slows debugging. This commit therefore prints a symbolic name in addition to the integer. Signed-off-by: Paul E. McKenney [ paulmck: Updated to fix relational operator called out by Dan Carpenter. ] [ paulmck: More "const", as suggested by Josh Triplett. ] Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 15 +++++++++++++-- kernel/rcu/tree.h | 12 ++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7b78c88e19a3..316354109734 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1186,6 +1186,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); } +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + /* * Complain about starvation of grace-period kthread. */ @@ -1197,10 +1207,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, - rsp->gp_flags, rsp->gp_state, + rsp->gp_flags, + gp_state_getname(rsp->gp_state), rsp->gp_state, rsp->gp_kthread ? rsp->gp_kthread->state : ~0); if (rsp->gp_kthread) sched_show_task(rsp->gp_kthread); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f32bebb6bc90..a3fb6fe94127 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -545,6 +545,18 @@ struct rcu_state { #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#ifndef RCU_TREE_NONCORE +static const char * const gp_state_names[] = { + "RCU_GP_IDLE", + "RCU_GP_WAIT_GPS", + "RCU_GP_DONE_GPS", + "RCU_GP_WAIT_FQS", + "RCU_GP_DOING_FQS", + "RCU_GP_CLEANUP", + "RCU_GP_CLEANED", +}; +#endif /* #ifndef RCU_TREE_NONCORE */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ -- cgit From 79cfea0273876d9c438f3227b8f68c8c7ae31583 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Dec 2015 13:09:52 -0800 Subject: rcu: Remove TINY_RCU bloat from pointless boot parameters The rcu_expedited, rcu_normal, and rcu_normal_after_boot kernel boot parameters are pointless in the case of TINY_RCU because in that case synchronous grace periods, both expedited and normal, are no-ops. However, these three symbols contribute several hundred bytes of bloat. This commit therefore uses CPP directives to avoid compiling this code in TINY_RCU kernels. Reported-by: kbuild test robot Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/ksysfs.c | 4 ++++ kernel/rcu/update.c | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index b4e2fa52d8bc..152da4a48867 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -144,6 +144,7 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +#ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -177,6 +178,7 @@ static ssize_t rcu_normal_store(struct kobject *kobj, return count; } KERNEL_ATTR_RW(rcu_normal); +#endif /* #ifndef CONFIG_TINY_RCU */ /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
@@ -219,8 +221,10 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif +#ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, +#endif NULL }; diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 12b91f5a60a6..76b94e19430b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -60,11 +60,12 @@ MODULE_ALIAS("rcupdate"); #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); - static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); +#endif /* #ifndef CONFIG_TINY_RCU */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -172,8 +173,6 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); -#endif /* #ifndef CONFIG_TINY_RCU */ - /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -185,6 +184,8 @@ void rcu_end_inkernel_boot(void) WRITE_ONCE(rcu_normal, 1); } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PREEMPT_RCU /* -- cgit From a87f203e2731ab477386c678e59033ee103018c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Oct 2015 12:38:49 -0700 Subject: rcu: Eliminate unused rcu_init_one() argument Now that the rcu_state structure's ->rda field is compile-time initialized, there is no need to pass the per-CPU rcu_data structure into rcu_init_one(). This commit therefore eliminates this now-unused parameter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- kernel/rcu/tree_plugin.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 93941d3434ad..9a4c8c0653ff 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4351,8 +4351,7 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) +static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; @@ -4545,8 +4544,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state); + rcu_init_one(&rcu_sched_state); if (dump_tree) rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e6da888cc908..fccef5d4b198 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -777,7 +777,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(rcu_state_p, rcu_data_p); + rcu_init_one(rcu_state_p); } /* -- cgit
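The rcu_init_one() change above works because each rcu_state structure now carries its per-CPU data pointer from compile time, so the extra function argument had become redundant. A small sketch of the idiom follows, with hypothetical names standing in for the kernel's structures.

#include <stdio.h>

struct pcpu_data {
	int qlen;
};

struct state {
	const char *name;
	struct pcpu_data *rda;	/* bound at compile time, like rsp->rda */
};

static struct pcpu_data demo_data;
static struct state demo_state = { .name = "demo", .rda = &demo_data };

/* No separate rda parameter: the function reads it from the structure. */
static void init_one(struct state *sp)
{
	sp->rda->qlen = 0;
	printf("%s initialized with rda=%p\n", sp->name, (void *)sp->rda);
}

int main(void)
{
	init_one(&demo_state);
	return 0;
}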
From d117c8aa1d511f76401337620b9c4ffb4c886579 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Oct 2015 00:01:18 -0700 Subject: rcu: Make cpu_needs_another_gp() be bool The cpu_needs_another_gp() function is currently of type int, but only returns zero or one. Bow to reality and make it be of type bool. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9a4c8c0653ff..d6863bceeb45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -597,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) * The caller must have disabled interrupts to prevent races with * normal callback registry. */ -static int +static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { int i; if (rcu_gp_in_progress(rsp)) - return 0; /* No, a grace period is already in progress. */ + return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) - return 1; /* Yes, a no-CBs CPU needs one. */ + return true; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) - return 0; /* No, this is a no-CBs (or offline) CPU. */ + return false; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) - return 1; /* Yes, this CPU has newly registered callbacks. */ + return true; /* Yes, CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) - return 1; /* Yes, CBs for future grace period. */ - return 0; /* No grace period needed. */ + return true; /* Yes, CBs for future grace period. */ + return false; /* No grace period needed. */ } /* -- cgit From 7c9906ca5e582a773fff696975e312cef58a7386 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Oct 2015 00:59:01 -0700 Subject: rcu: Don't redundantly disable irqs in rcu_irq_{enter,exit}() This commit replaces a local_irq_save()/local_irq_restore() pair with a lockdep assertion that interrupts are already disabled. This should remove the corresponding overhead from the interrupt entry/exit fastpaths. This change was inspired by the fact that Iftekhar Ahmed's mutation testing showed that removing rcu_irq_enter()'s call to local_irq_restore() had no effect, which might indicate that interrupts were always enabled anyway. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d6863bceeb45..40940b0d0310 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -732,7 +732,7 @@ void rcu_user_enter(void) * * Exit from an interrupt handler, which might possibly result in entering * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your @@ -745,11 +745,10 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; @@ -760,6 +759,17 @@ void rcu_irq_exit(void) else rcu_eqs_enter_common(oldval, true); rcu_sysidle_enter(1); +} + +/* + * Wrapper for rcu_irq_exit() where interrupts are enabled. 
+ */ +void rcu_irq_exit_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_exit(); local_irq_restore(flags); } @@ -857,7 +867,7 @@ void rcu_user_exit(void) * * Enter an interrupt handler, which might possibly result in exiting * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * Note that the Linux kernel is fully capable of entering an interrupt * handler that it never exits, for example when doing upcalls to @@ -873,11 +883,10 @@ void rcu_user_exit(void) */ void rcu_irq_enter(void) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; @@ -888,6 +897,17 @@ void rcu_irq_enter(void) else rcu_eqs_exit_common(oldval, true); rcu_sysidle_exit(1); +} + +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. + */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_enter(); local_irq_restore(flags); } -- cgit From e11f13355b09df970495c45ed0eac1dc85dcf5c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Nov 2015 08:22:05 -0800 Subject: rcu: Move wakeup out from under rnp->lock This patch removes a potential deadlock hazard by moving the wake_up_process() in rcu_spawn_gp_kthread() out from under rnp->lock. Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40940b0d0310..87b604d0b0d2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4319,8 +4319,8 @@ static int __init rcu_spawn_gp_kthread(void) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); } rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); -- cgit
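Moving wake_up_process() to after the raw_spin_unlock_irqrestore() above keeps scheduler locks from being acquired while rnp->lock is held. The same discipline applies to ordinary condition variables in user space: change the state under the lock, drop the lock, then wake. A minimal pthread sketch of the pattern, not the kernel code itself:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool ready;

static void *waiter(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!ready)	/* tolerates spurious wakeups */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	printf("waiter: woken after the lock was dropped\n");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&lock);
	ready = true;	/* state change under the lock */
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&cond);	/* wake only after unlocking */
	pthread_join(&t, NULL);
	return 0;
}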
*/ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } /* Advance to a new grace period and initialize state. */ @@ -1929,7 +1929,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rsp->gp_activity, jiffies); } - return 1; + return true; } /* -- cgit