From b8c17e6664c461e4aed545a943304c3b32dd309c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Nov 2016 14:25:21 -0800 Subject: rcu: Maintain special bits at bottom of ->dynticks counter Currently, IPIs are used to force other CPUs to invalidate their TLBs in response to a kernel virtual-memory mapping change. This works, but degrades both battery lifetime (for idle CPUs) and real-time response (for nohz_full CPUs), and in addition results in unnecessary IPIs due to the fact that CPUs executing in usermode are unaffected by stale kernel mappings. It would be better to cause a CPU executing in usermode to wait until it is entering kernel mode to do the flush, first to avoid interrupting usemode tasks and second to handle multiple flush requests with a single flush in the case of a long-running user task. This commit therefore reserves a bit at the bottom of the ->dynticks counter, which is checked upon exit from extended quiescent states. If it is set, it is cleared and then a new rcu_eqs_special_exit() macro is invoked, which, if not supplied, is an empty single-pass do-while loop. If this bottom bit is set on -entry- to an extended quiescent state, then a WARN_ON_ONCE() triggers. This bottom bit may be set using a new rcu_eqs_special_set() function, which returns true if the bit was set, or false if the CPU turned out to not be in an extended quiescent state. Please note that this function refuses to set the bit for a non-nohz_full CPU when that CPU is executing in usermode because usermode execution is tracked by RCU as a dyntick-idle extended quiescent state only for nohz_full CPUs. Reported-by: Andy Lutomirski Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ec62a05bfdb3..7468b4de7e0c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -596,6 +596,7 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -- cgit From abb06b99484a9f5af05c7147c289faf835f68e8e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Jan 2017 13:45:38 -0800 Subject: rcu: Pull rcu_sched_qs_mask into rcu_dynticks structure The rcu_sched_qs_mask variable is yet another isolated per-CPU variable, so this commit pulls it into the pre-existing rcu_dynticks per-CPU structure. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7468b4de7e0c..e298281984dc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,6 +113,7 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ + int rcu_sched_qs_mask; /* GP old, need quiescent state. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. */ -- cgit From 9577df9a3122af08fff84b8a1a60dccf524a3891 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 Jan 2017 16:18:07 -0800 Subject: rcu: Pull rcu_qs_ctr into rcu_dynticks structure The rcu_qs_ctr variable is yet another isolated per-CPU variable, so this commit pulls it into the pre-existing rcu_dynticks per-CPU structure. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e298281984dc..76e4467bc765 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,7 +113,8 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ - int rcu_sched_qs_mask; /* GP old, need quiescent state. */ + int rcu_sched_qs_mask; /* GP old, need heavy quiescent state. */ + unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. */ -- cgit From 0f9be8cabbc343218dd2807af7308656be113045 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Jan 2017 13:17:02 -0800 Subject: rcu: Eliminate flavor scan in rcu_momentary_dyntick_idle() The rcu_momentary_dyntick_idle() function scans the RCU flavors, checking that one of them still needs a quiescent state before doing an expensive atomic operation on the ->dynticks counter. However, this check reduces overhead only after a rare race condition, and increases complexity. This commit therefore removes the scan and the mechanism enabling the scan. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 76e4467bc765..b212cd0f22c7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,7 +113,7 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ - int rcu_sched_qs_mask; /* GP old, need heavy quiescent state. */ + bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; @@ -484,7 +484,6 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS + 1]; /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ - u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ call_rcu_func_t call; /* call_rcu() flavor. */ int ncpus; /* # CPUs seen so far. */ -- cgit From 9226b10d78ffe7895549045fe388dc5e73b87eac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Jan 2017 14:17:50 -0800 Subject: rcu: Place guard on rcu_all_qs() and rcu_note_context_switch() actions The rcu_all_qs() and rcu_note_context_switch() do a series of checks, taking various actions to supply RCU with quiescent states, depending on the outcomes of the various checks. This is a bit much for scheduling fastpaths, so this commit creates a separate ->rcu_urgent_qs field in the rcu_dynticks structure that acts as a global guard for these checks. Thus, in the common case, rcu_all_qs() and rcu_note_context_switch() check the ->rcu_urgent_qs field, find it false, and simply return. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra --- kernel/rcu/tree.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b212cd0f22c7..d2f276fc2edc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,8 +113,9 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ - bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ + bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ + bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. */ -- cgit From b8c78d3afc6aac1c722af3bec18959c6bd93231c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Feb 2017 09:54:05 -0800 Subject: rcu: Default RCU_FANOUT_LEAF to 16 unless explicitly changed If the RCU_EXPERT Kconfig option is not set (the default), then the RCU_FANOUT_LEAF Kconfig option will not be defined, which will cause the leaf-level rcu_node tree fanout to default to 32 on 32-bit systems and 64 on 64-bit systems. This can result in excessive lock contention. This commit therefore changes the computation of the leaf-level rcu_node tree fanout so that the result will be 16 unless an explicit Kconfig or kernel-boot setting says otherwise. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d2f276fc2edc..376c01e539c7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -52,11 +52,7 @@ #ifdef CONFIG_RCU_FANOUT_LEAF #define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF #else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT_LEAF 64 -# else -# define RCU_FANOUT_LEAF 32 -# endif +#define RCU_FANOUT_LEAF 16 #endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ #define RCU_FANOUT_1 (RCU_FANOUT_LEAF) -- cgit From 15fecf89e46a962ccda583d919e25d9da7bf0723 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Feb 2017 12:36:42 -0800 Subject: srcu: Abstract multi-tail callback list handling RCU has only one multi-tail callback list, which is implemented via the nxtlist, nxttail, nxtcompleted, qlen_lazy, and qlen fields in the rcu_data structure, and whose operations are open-code throughout the Tree RCU implementation. This has been more or less OK in the past, but upcoming callback-list optimizations in SRCU could really use a multi-tail callback list there as well. This commit therefore abstracts the multi-tail callback list handling into a new kernel/rcu/rcu_segcblist.h file, and uses this new API. The simple head-and-tail pointer callback list is also abstracted and applied everywhere except for the NOCB callback-offload lists. (Yes, the plan is to apply them there as well, but this commit is already bigger than would be good.) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 41 +++++++---------------------------------- 1 file changed, 7 insertions(+), 34 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 376c01e539c7..93889ff21dbb 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -30,6 +30,7 @@ #include #include #include +#include "rcu_segcblist.h" /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -335,34 +336,9 @@ struct rcu_data { /* period it is aware of. */ /* 2) batch handling */ - /* - * If nxtlist is not NULL, it is partitioned as follows. - * Any of the partitions might be empty, in which case the - * pointer to that partition will be equal to the pointer for - * the following partition. When the list is empty, all of - * the nxttail elements point to the ->nxtlist pointer itself, - * which in that case is NULL. - * - * [nxtlist, *nxttail[RCU_DONE_TAIL]): - * Entries that batch # <= ->completed - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * Note that the value of *nxttail[RCU_NEXT_TAIL] will - * always be NULL, as this is the end of the list. - */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail[RCU_NEXT_SIZE]; - unsigned long nxtcompleted[RCU_NEXT_SIZE]; - /* grace periods for sublists. */ - long qlen_lazy; /* # of lazy queued callbacks */ - long qlen; /* # of queued callbacks, incl lazy */ + struct rcu_segcblist cblist; /* Segmented callback list, with */ + /* different callbacks waiting for */ + /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ @@ -500,14 +476,11 @@ struct rcu_state { raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; /* Protect following fields. */ - struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ /* need a grace period. */ - struct rcu_head **orphan_nxttail; /* Tail of above. */ - struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_done; /* Orphaned callbacks that */ /* are ready to invoke. */ - struct rcu_head **orphan_donetail; /* Tail of above. */ - long qlen_lazy; /* Number of lazy callbacks. */ - long qlen; /* Total number of callbacks. */ + /* (Contains counts.) */ /* End of fields guarded by orphan_lock. */ struct mutex barrier_mutex; /* Guards barrier fields. */ -- cgit From 8660b7d8a545227fd9ee80508aa82528ea9947d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Mar 2017 16:48:18 -0700 Subject: srcu: Use rcu_segcblist to track SRCU callbacks This commit switches SRCU from custom-built callback queues to the new rcu_segcblist structure. This change associates grace-period sequence numbers with groups of callbacks, which will be needed for efficient processing of per-CPU callbacks. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 93889ff21dbb..4f62651588ea 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -30,7 +30,7 @@ #include #include #include -#include "rcu_segcblist.h" +#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and -- cgit From f2425b4efb0c69e77c0b9666b605ae4a1ecaae47 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Mar 2017 12:42:30 -0700 Subject: srcu: Move combining-tree definitions for SRCU's benefit This commit moves the C preprocessor code that defines the default shape of the rcu_node combining tree to a new include/linux/rcu_node_tree.h file as a first step towards enabling SRCU to create its own combining tree, which in turn enables SRCU to implement per-CPU callback handling, thus avoiding contention on the lock currently guarding the single list of callbacks. Note that users of SRCU still need to know the size of the srcu_struct structure, hence include/linux rather than kernel/rcu. This commit is code-movement only. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 71 +------------------------------------------------------ 1 file changed, 1 insertion(+), 70 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4f62651588ea..1bec3958d44f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -31,76 +31,7 @@ #include #include #include - -/* - * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and - * CONFIG_RCU_FANOUT_LEAF. - * In theory, it should be possible to add more levels straightforwardly. - * In practice, this did work well going from three levels to four. - * Of course, your mileage may vary. - */ - -#ifdef CONFIG_RCU_FANOUT -#define RCU_FANOUT CONFIG_RCU_FANOUT -#else /* #ifdef CONFIG_RCU_FANOUT */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT 64 -# else -# define RCU_FANOUT 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT */ - -#ifdef CONFIG_RCU_FANOUT_LEAF -#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF -#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ -#define RCU_FANOUT_LEAF 16 -#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ - -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT_1 -# define RCU_NUM_LVLS 1 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_NODES NUM_RCU_LVL_0 -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } -# define RCU_NODE_NAME_INIT { "rcu_node_0" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -#elif NR_CPUS <= RCU_FANOUT_2 -# define RCU_NUM_LVLS 2 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -#elif NR_CPUS <= RCU_FANOUT_3 -# define RCU_NUM_LVLS 3 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -#elif NR_CPUS <= RCU_FANOUT_4 -# define RCU_NUM_LVLS 4 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -#else -# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ - -extern int rcu_num_lvls; -extern int rcu_num_nodes; +#include /* * Dynticks per-CPU state. -- cgit From efbe451d46af62369226e42b98dbcd95b6940a63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 15 Mar 2017 13:07:53 -0700 Subject: srcu: Move rcu_node traversal macros to rcu.h This commit moves rcu_for_each_node_breadth_first(), rcu_for_each_nonleaf_node_breadth_first(), and rcu_for_each_leaf_node() from kernel/rcu/tree.h to kernel/rcu/rcu.h so that SRCU can access them. This commit is code-movement only. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 1bec3958d44f..a2a45cb629d6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -192,41 +192,6 @@ struct rcu_node { */ #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) -/* - * Do a full breadth-first scan of the rcu_node structures for the - * specified rcu_state structure. - */ -#define rcu_for_each_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Do a breadth-first scan of the non-leaf rcu_node structures for the - * specified rcu_state structure. Note that if there is a singleton - * rcu_node tree with but one rcu_node structure, this loop is a no-op. - */ -#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) - -/* - * Scan the leaves of the rcu_node hierarchy for the specified rcu_state - * structure. Note that if there is a singleton rcu_node tree with but - * one rcu_node structure, this loop -will- visit the rcu_node structure. - * It is still a leaf node, even if it is also the root node. - */ -#define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Iterate over all possible CPUs in a leaf RCU node. - */ -#define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) - /* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. -- cgit From da915ad5cf25b5f5d358dd3670c3378d8ae8c03e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Apr 2017 09:01:53 -0700 Subject: srcu: Parallelize callback handling Peter Zijlstra proposed using SRCU to reduce mmap_sem contention [1,2], however, there are workloads that could result in a high volume of concurrent invocations of call_srcu(), which with current SRCU would result in excessive lock contention on the srcu_struct structure's ->queue_lock, which protects SRCU's callback lists. This commit therefore moves SRCU to per-CPU callback lists, thus greatly reducing contention. Because a given SRCU instance no longer has a single centralized callback list, starting grace periods and invoking callbacks are both more complex than in the single-list Classic SRCU implementation. Starting grace periods and handling callbacks are now handled using an srcu_node tree that is in some ways similar to the rcu_node trees used by RCU-bh, RCU-preempt, and RCU-sched (for example, the srcu_node tree shape is controlled by exactly the same Kconfig options and boot parameters that control the shape of the rcu_node tree). In addition, the old per-CPU srcu_array structure is now named srcu_data and contains an rcu_segcblist structure named ->srcu_cblist for its callbacks (and a spinlock to protect this). The srcu_struct gets an srcu_gp_seq that is used to associate callback segments with the corresponding completion-time grace-period number. These completion-time grace-period numbers are propagated up the srcu_node tree so that the grace-period workqueue handler can determine whether additional grace periods are needed on the one hand and where to look for callbacks that are ready to be invoked. The srcu_barrier() function must now wait on all instances of the per-CPU ->srcu_cblist. Because each ->srcu_cblist is protected by ->lock, srcu_barrier() can remotely add the needed callbacks. In theory, it could also remotely start grace periods, but in practice doing so is complex and racy. And interestingly enough, it is never necessary for srcu_barrier() to start a grace period because srcu_barrier() only enqueues a callback when a callback is already present--and it turns out that a grace period has to have already been started for this pre-existing callback. Furthermore, it is only the callback that srcu_barrier() needs to wait on, not any particular grace period. Therefore, a new rcu_segcblist_entrain() function enqueues the srcu_barrier() function's callback into the same segment occupied by the last pre-existing callback in the list. The special case where all the pre-existing callbacks are on a different list (because they are in the process of being invoked) is handled by enqueuing srcu_barrier()'s callback into the RCU_DONE_TAIL segment, relying on the done-callbacks check that takes place after all callbacks are inovked. Note that the readers use the same algorithm as before. Note that there is a separate srcu_idx that tells the readers what counter to increment. This unfortunately cannot be combined with srcu_gp_seq because they need to be incremented at different times. This commit introduces some ugly #ifdefs in rcutorture. These will go away when I feel good enough about Tree SRCU to ditch Classic SRCU. Some crude performance comparisons, courtesy of a quickly hacked rcuperf asynchronous-grace-period capability: Callback Queuing Overhead ------------------------- # CPUS Classic SRCU Tree SRCU ------ ------------ --------- 2 0.349 us 0.342 us 16 31.66 us 0.4 us 41 --------- 0.417 us The times are the 90th percentiles, a statistic that was chosen to reject the overheads of the occasional srcu_barrier() call needed to avoid OOMing the test machine. The rcuperf test hangs when running Classic SRCU at 41 CPUs, hence the line of dashes. Despite the hacks to both the rcuperf code and that statistics, this is a convincing demonstration of Tree SRCU's performance and scalability advantages. [1] https://lwn.net/Articles/309030/ [2] https://patchwork.kernel.org/patch/5108281/ Signed-off-by: Paul E. McKenney [ paulmck: Fix initialization if synchronize_srcu_expedited() called first. ] --- kernel/rcu/tree.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/rcu/tree.h') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a2a45cb629d6..0e598ab08fea 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -541,6 +541,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +#ifdef CONFIG_SRCU +void srcu_online_cpu(unsigned int cpu); +void srcu_offline_cpu(unsigned int cpu); +#else /* #ifdef CONFIG_SRCU */ +void srcu_online_cpu(unsigned int cpu) { } +void srcu_offline_cpu(unsigned int cpu) { } +#endif /* #else #ifdef CONFIG_SRCU */ + #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE -- cgit