Diffstat (limited to 'kernel/rcu')
-rw-r--r--   kernel/rcu/Kconfig              312
-rw-r--r--   kernel/rcu/Kconfig.debug        159
-rw-r--r--   kernel/rcu/Makefile               8
-rw-r--r--   kernel/rcu/rcu.h                320
-rw-r--r--   kernel/rcu/rcu_segcblist.c      374
-rw-r--r--   kernel/rcu/rcu_segcblist.h      107
-rw-r--r--   kernel/rcu/rcuperf.c            696
-rw-r--r--   kernel/rcu/rcuscale.c          1219
-rw-r--r--   kernel/rcu/rcutorture.c        3398
-rw-r--r--   kernel/rcu/refscale.c          1616
-rw-r--r--   kernel/rcu/srcutiny.c           157
-rw-r--r--   kernel/rcu/srcutree.c          1844
-rw-r--r--   kernel/rcu/sync.c               243
-rw-r--r--   kernel/rcu/tasks.h             2283
-rw-r--r--   kernel/rcu/tiny.c               143
-rw-r--r--   kernel/rcu/tree.c              4254
-rw-r--r--   kernel/rcu/tree.h               340
-rw-r--r--   kernel/rcu/tree_exp.h           918
-rw-r--r--   kernel/rcu/tree_nocb.h         1705
-rw-r--r--   kernel/rcu/tree_plugin.h       2191
-rw-r--r--   kernel/rcu/tree_stall.h        1185
-rw-r--r--   kernel/rcu/update.c             736
22 files changed, 17630 insertions, 6578 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 939a2056c87a..4d9b21f69eaa 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related configuration options # @@ -6,7 +7,9 @@ menu "RCU Subsystem" config TREE_RCU bool - default y if !PREEMPT && SMP + default y if SMP + # Dynticks-idle tracking + select CONTEXT_TRACKING_IDLE help This option selects the RCU implementation that is designed for very large SMP system with hundreds or @@ -15,7 +18,8 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPT + default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC) + select TREE_RCU help This option selects the RCU implementation that is designed for very large SMP systems with hundreds or @@ -27,7 +31,7 @@ config PREEMPT_RCU config TINY_RCU bool - default y if !PREEMPT && !SMP + default y if !PREEMPT_RCU && !SMP help This option selects the RCU implementation that is designed for UP systems from which real-time response @@ -49,35 +53,97 @@ config RCU_EXPERT Say N if you are unsure. -config SRCU - bool - help - This option selects the sleepable version of RCU. This version - permits arbitrary sleeping or blocking within RCU read-side critical - sections. - config TINY_SRCU bool - default y if SRCU && TINY_RCU + default y if TINY_RCU help This option selects the single-CPU non-preemptible version of SRCU. config TREE_SRCU bool - default y if SRCU && !TINY_RCU + default y if !TINY_RCU help This option selects the full-fledged version of SRCU. +config FORCE_NEED_SRCU_NMI_SAFE + bool "Force selection of NEED_SRCU_NMI_SAFE" + depends on !TINY_SRCU + depends on RCU_EXPERT + depends on ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select NEED_SRCU_NMI_SAFE + default n + help + This option forces selection of the NEED_SRCU_NMI_SAFE + Kconfig option, allowing testing of srcu_read_lock_nmisafe() + and srcu_read_unlock_nmisafe() on architectures (like x86) + that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option. + +config NEED_SRCU_NMI_SAFE + def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU + +config TASKS_RCU_GENERIC + def_bool TASKS_RCU || TASKS_RUDE_RCU || TASKS_TRACE_RCU + help + This option enables generic infrastructure code supporting + task-based RCU implementations. Not for manual selection. + +config FORCE_TASKS_RCU + bool "Force selection of TASKS_RCU" + depends on RCU_EXPERT + select TASKS_RCU + default n + help + This option force-enables a task-based RCU implementation + that uses only voluntary context switch (not preemption!), + idle, and user-mode execution as quiescent states. Not for + manual selection in most cases. + +config NEED_TASKS_RCU + bool + default n + config TASKS_RCU - def_bool PREEMPT - select SRCU + bool + default NEED_TASKS_RCU && PREEMPTION + select IRQ_WORK + +config FORCE_TASKS_RUDE_RCU + bool "Force selection of Tasks Rude RCU" + depends on RCU_EXPERT + select TASKS_RUDE_RCU + default n + help + This option force-enables a task-based RCU implementation + that uses only context switch (including preemption) and + user-mode execution as quiescent states. It forces IPIs and + context switches on all online CPUs, including idle ones, + so use with caution. Not for manual selection in most cases. 
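The Tasks RCU flavors configured here exist largely so that tracing can free code, such as trampolines, that some task may be executing without any explicit read-side marker. A minimal sketch of the usage pattern these options enable; struct trampoline and its fields are hypothetical stand-ins, but synchronize_rcu_tasks() is the real API behind CONFIG_TASKS_RCU:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct trampoline {          /* hypothetical stand-in for an ftrace-style stub */
        void *text;              /* executable code some task may be running in */
    };

    static void retire_trampoline(struct trampoline *tp)
    {
        /*
         * Wait until every task has passed through a voluntary context
         * switch, idle, or user-mode execution.  After this returns, no
         * task can still be executing in tp->text.
         */
        synchronize_rcu_tasks();
        kfree(tp->text);         /* hypothetical cleanup */
        kfree(tp);
    }

The rude variant makes the same guarantee by instead forcing a context switch on every online CPU, and the trace variant relies on explicit rcu_read_lock_trace() markers, as the help texts around this point describe.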
+ +config TASKS_RUDE_RCU + bool + default n + select IRQ_WORK + +config FORCE_TASKS_TRACE_RCU + bool "Force selection of Tasks Trace RCU" + depends on RCU_EXPERT + select TASKS_TRACE_RCU + default n help This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. + explicit rcu_read_lock_trace() read-side markers, and allows + these readers to appear in the idle loop as well as on the + CPU hotplug code paths. It can force IPIs on online CPUs, + including idle ones, so use with caution. Not for manual + selection in most cases. + +config TASKS_TRACE_RCU + bool + default n + select IRQ_WORK config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU ) + def_bool TREE_RCU help This option enables RCU CPU stall code that is common between the TINY and TREE variants of RCU. The purpose is to allow @@ -85,43 +151,13 @@ config RCU_STALL_COMMON making these warnings mandatory for the tree variants. config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) - -config CONTEXT_TRACKING - bool - -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also - other dependencies to provide in order to make the full - dynticks working. - - This option stands for testing when an arch implements the - context tracking backend but doesn't yet fullfill all the - requirements to make the full dynticks feature working. - Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU - userspace extended quiescent state and tickless cputime - accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all - CPUs in the system. - - Say Y only if you're working on the development of an - architecture backend for the context tracking. - - Say N otherwise, this option brings an overhead that you - don't want in production. - + def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC ) config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + depends on TREE_RCU && RCU_EXPERT default 64 if 64BIT default 32 if !64BIT help @@ -139,10 +175,12 @@ config RCU_FANOUT config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT - default 16 + range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 3 if RCU_STRICT_GRACE_PERIOD + depends on TREE_RCU && RCU_EXPERT + default 16 if !RCU_STRICT_GRACE_PERIOD + default 2 if RCU_STRICT_GRACE_PERIOD help This option controls the leaf-level fanout of hierarchical implementations of RCU, and allows trading off cache misses @@ -170,28 +208,10 @@ config RCU_FANOUT_LEAF Take the default if unsure. 
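To make the two fanout knobs concrete: with the 64-bit defaults of RCU_FANOUT=64 and RCU_FANOUT_LEAF=16, a 4096-CPU system needs 4096/16 = 256 leaf rcu_node structures, 256/64 = 4 interior nodes, and one root, giving a three-level tree. A back-of-envelope sketch that mirrors, but is not copied from, the kernel's own geometry computation:

    #include <stdio.h>

    int main(void)
    {
        int ncpus = 4096, leaf = 16, fanout = 64;
        int nodes = (ncpus + leaf - 1) / leaf;   /* leaf level: 256 nodes */
        int levels = 1;

        while (nodes > 1) {                      /* fold each level into its parents */
            nodes = (nodes + fanout - 1) / fanout;
            levels++;
        }
        printf("levels = %d\n", levels);         /* prints: levels = 3 */
        return 0;
    }

This also hints at why RCU_STRICT_GRACE_PERIOD (seen later in this series) clamps the leaf fanout to at most 3: presumably so that even the at-most-4-CPU systems it supports exercise more than a single leaf node.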
-config RCU_FAST_NO_HZ - bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ_COMMON && SMP && RCU_EXPERT - default n - help - This option permits CPUs to enter dynticks-idle state even if - they have RCU callbacks queued, and prevents RCU from waking - these CPUs up more than roughly once every four jiffies (by - default, you can adjust this using the rcutree.rcu_idle_gp_delay - parameter), thus improving energy efficiency. On the other - hand, this option increases the duration of RCU grace periods, - for example, slowing down synchronize_rcu(). - - Say Y if energy efficiency is critically important, and you - don't care about increased grace-period durations. - - Say N if you are unsure. - config RCU_BOOST bool "Enable RCU priority boosting" - depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT - default n + depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT + default y if PREEMPT_RT help This option boosts the priority of preempted RCU readers that block the current preemptible RCU grace period for too long. @@ -214,28 +234,146 @@ config RCU_BOOST_DELAY Accept the default if unsure. +config RCU_EXP_KTHREAD + bool "Perform RCU expedited work in a real-time kthread" + depends on RCU_BOOST && RCU_EXPERT + default !PREEMPT_RT && NR_CPUS <= 32 + help + Use this option to further reduce the latencies of expedited + grace periods at the expense of being more disruptive. + + This option is disabled by default on PREEMPT_RT=y kernels which + disable expedited grace periods after boot by unconditionally + setting rcupdate.rcu_normal_after_boot=1. + + Accept the default if unsure. + config RCU_NOCB_CPU bool "Offload RCU callback processing from boot-selected CPUs" - depends on TREE_RCU || PREEMPT_RCU + depends on TREE_RCU depends on RCU_EXPERT || NO_HZ_FULL default n help Use this option to reduce OS jitter for aggressive HPC or real-time workloads. It can also be used to offload RCU callback invocation to energy-efficient CPUs in battery-powered - asymmetric multiprocessors. - - This option offloads callback invocation from the set of CPUs - specified at boot time by the rcu_nocbs parameter. For each - such CPU, a kthread ("rcuox/N") will be created to invoke - callbacks, where the "N" is the CPU being offloaded, and where - the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched - (!PREEMPT kernels). Nothing prevents this kthread from running - on the specified CPUs, but (1) the kthreads may be preempted - between each callback, and (2) affinity or cgroups can be used - to force the kthreads to run on whatever set of CPUs is desired. - - Say Y here if you want to help to debug reduced OS jitter. + asymmetric multiprocessors. The price of this reduced jitter + is that the overhead of call_rcu() increases and that some + workloads will incur significant increases in context-switch + rates. + + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "p" for RCU-preempt (PREEMPTION kernels) + and "s" for RCU-sched (!PREEMPTION kernels). This option + also creates another kthread for each sqrt(nr_cpu_ids) CPUs + ("rcuog/N", where N is the first CPU in that group to come + online), which handles grace periods for its group. 
Nothing + prevents these kthreads from running on the specified CPUs, + but (1) the kthreads may be preempted between each callback, + and (2) affinity or cgroups can be used to force the kthreads + to run on whatever set of CPUs is desired. + + The sqrt(nr_cpu_ids) grouping may be overridden using the + rcutree.rcu_nocb_gp_stride kernel boot parameter. This can + be especially helpful for smaller numbers of CPUs, where + sqrt(nr_cpu_ids) can be a bit of a blunt instrument. + + Say Y here if you need reduced OS jitter, despite added overhead. + Say N here if you are unsure. + +config RCU_NOCB_CPU_DEFAULT_ALL + bool "Offload RCU callback processing from all CPUs by default" + depends on RCU_NOCB_CPU + default n + help + Use this option to offload callback processing from all CPUs + by default, in the absence of the rcu_nocbs or nohz_full boot + parameter. This also avoids the need to use any boot parameters + to achieve the effect of offloading all CPUs on boot. + + Say Y here if you want offload all CPUs by default on boot. + Say N here if you are unsure. + +config RCU_NOCB_CPU_CB_BOOST + bool "Offload RCU callback from real-time kthread" + depends on RCU_NOCB_CPU && RCU_BOOST + default y if PREEMPT_RT + help + Use this option to invoke offloaded callbacks as SCHED_FIFO + to avoid starvation by heavy SCHED_OTHER background load. + Of course, running as SCHED_FIFO during callback floods will + cause the rcuo[ps] kthreads to monopolize the CPU for hundreds + of milliseconds or more. Therefore, when enabling this option, + it is your responsibility to ensure that latency-sensitive + tasks either run with higher priority or run on some other CPU. + + Say Y here if you want to set RT priority for offloading kthreads. + Say N here if you are building a !PREEMPT_RT kernel and are unsure. + +config TASKS_TRACE_RCU_READ_MB + bool "Tasks Trace RCU readers use memory barriers in user and idle" + depends on RCU_EXPERT && TASKS_TRACE_RCU + default PREEMPT_RT || NR_CPUS < 8 + help + Use this option to further reduce the number of IPIs sent + to CPUs executing in userspace or idle during tasks trace + RCU grace periods. Given that a reasonable setting of + the rcupdate.rcu_task_ipi_delay kernel boot parameter + eliminates such IPIs for many workloads, proper setting + of this Kconfig option is important mostly for aggressive + real-time installations and for battery-powered devices, + hence the default chosen above. + + Say Y here if you hate IPIs. + Say N here if you hate read-side memory barriers. + Take the default if you are unsure. + +config RCU_LAZY + bool "RCU callback lazy invocation functionality" + depends on RCU_NOCB_CPU + default n + help + To save power, batch RCU callbacks and delay starting the + corresponding grace period for multiple seconds. The grace + period will be started after this delay, in case of memory + pressure, or if the corresponding CPU's callback list grows + too large. + + These delays happen only on rcu_nocbs CPUs, that is, CPUs + whose callbacks have been offloaded. + + Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to + globally disable these delays. + +config RCU_LAZY_DEFAULT_OFF + bool "Turn RCU lazy invocation off by default" + depends on RCU_LAZY + default n + help + Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel + to boot with these energy-efficiency delays disabled. Use the + rcutree.enable_rcu_lazy=0 kernel-boot parameter to override + the this option at boot time, thus re-enabling these delays. 
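The laziness that RCU_LAZY and RCU_LAZY_DEFAULT_OFF tune is visible to call_rcu() users: callbacks that merely free memory tolerate seconds of batching, while callbacks that some task waits on must opt out. A hedged sketch, assuming a hypothetical struct my_obj; call_rcu_hurry() is the real opt-out interface that accompanies CONFIG_RCU_LAZY:

    #include <linux/completion.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct my_obj {                      /* hypothetical example structure */
        struct rcu_head rh;
        struct completion *done;
    };

    static void free_obj_cb(struct rcu_head *rhp)
    {
        kfree(container_of(rhp, struct my_obj, rh));
    }

    static void release_obj(struct my_obj *obj)
    {
        /* Pure memory reclamation: fine to batch for seconds. */
        call_rcu(&obj->rh, free_obj_cb);
    }

    static void wake_cb(struct rcu_head *rhp)
    {
        struct my_obj *obj = container_of(rhp, struct my_obj, rh);

        complete(obj->done);             /* a task is blocked on this */
        kfree(obj);
    }

    static void release_obj_and_wake(struct my_obj *obj)
    {
        /* A waited-on callback must not be delayed: bypass laziness. */
        call_rcu_hurry(&obj->rh, wake_cb);
    }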
+ +config RCU_DOUBLE_CHECK_CB_TIME + bool "RCU callback-batch backup time check" + depends on RCU_EXPERT + default n + help + Use this option to provide more precise enforcement of the + rcutree.rcu_resched_ns module parameter in situations where + a single RCU callback might run for hundreds of microseconds, + thus defeating the 32-callback batching used to amortize the + cost of the fine-grained but expensive local_clock() function. + + This option rounds rcutree.rcu_resched_ns up to the next + jiffy, and overrides the 32-callback batching if this limit + is exceeded. + + Say Y here if you need tighter callback-limit enforcement. Say N here if you are unsure. endmenu # "RCU Subsystem" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 0ec7d1d33a14..625d75392647 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related debugging configuration options # @@ -7,16 +8,25 @@ menu "RCU Debugging" config PROVE_RCU def_bool PROVE_LOCKING +config PROVE_RCU_LIST + bool "RCU list lockdep debugging" + depends on PROVE_RCU && RCU_EXPERT + default n + help + Enable RCU lockdep checking for list usages. By default it is + turned off since there are several list RCU users that still + need to be converted to pass a lockdep expression. To prevent + false-positive splats, we keep it default disabled but once all + users are converted, we can remove this config option. + config TORTURE_TEST tristate default n -config RCU_PERF_TEST +config RCU_SCALE_TEST tristate "performance tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST - select SRCU - select TASKS_RCU default n help This option provides a kernel module that runs performance @@ -32,8 +42,6 @@ config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST - select SRCU - select TASKS_RCU default n help This option provides a kernel module that runs torture tests @@ -45,6 +53,66 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_TORTURE_TEST_CHK_RDR_STATE + bool "Check rcutorture reader state" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to check the desired rcutorture + reader state for each segment against the actual context. + Note that PREEMPT_COUNT must be enabled if the preempt-disabled + and bh-disabled checks are to take effect, and that PREEMPT_RCU + must be enabled for the RCU-nesting checks to take effect. + These checks add overhead, and this Kconfig options is therefore + disabled by default. + + Say Y here if you want rcutorture reader contexts checked. + Say N if you are unsure. + +config RCU_TORTURE_TEST_LOG_CPU + bool "Log CPU for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + number of the CPU that the reader was running on at the time. + This information can be useful, but it does incur additional + overhead, overhead that can make both failures and close calls + less probable. + + Say Y here if you want CPU IDs logged. + Say N if you are unsure. 
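Among the debug options added above, PROVE_RCU_LIST has the most direct API-level counterpart: the "lockdep expression" its help text mentions is the optional fourth argument of list_for_each_entry_rcu(), which lets a traversal protected by a lock rather than by rcu_read_lock() say so. A sketch with a hypothetical list and lock:

    #include <linux/lockdep.h>
    #include <linux/rculist.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(mylist_lock); /* hypothetical lock and list */
    static LIST_HEAD(mylist);

    struct item {
        struct list_head node;
        int val;
    };

    static int sum_items_locked(void)
    {
        struct item *p;
        int sum = 0;

        spin_lock(&mylist_lock);
        /*
         * No rcu_read_lock() here: the fourth argument tells
         * PROVE_RCU_LIST that holding mylist_lock also makes this
         * traversal legitimate, avoiding a false-positive splat.
         */
        list_for_each_entry_rcu(p, &mylist, node,
                                lockdep_is_held(&mylist_lock))
            sum += p->val;
        spin_unlock(&mylist_lock);

        return sum;
    }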
+ +config RCU_TORTURE_TEST_LOG_GP + bool "Log grace-period numbers for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + corresponding grace-period sequence numbers. This information + can be useful, but it does incur additional overhead, overhead + that can make both failures and close calls less probable. + + Say Y here if you want grace-period sequence numbers logged. + Say N if you are unsure. + +config RCU_REF_SCALE_TEST + tristate "Scalability tests for read-side synchronization (RCU and others)" + depends on DEBUG_KERNEL + select TORTURE_TEST + default n + help + This option provides a kernel module that runs performance tests + useful comparing RCU with various read-side synchronization mechanisms. + The kernel module may be built after the fact on the running kernel to be + tested, if desired. + + Say Y here if you want these performance tests built into the kernel. + Say M if you want to build it as a module instead. + Say N if you are unsure. + config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON @@ -56,6 +124,57 @@ config RCU_CPU_STALL_TIMEOUT RCU grace period persists, additional CPU stall warnings are printed at more widely spaced intervals. +config RCU_EXP_CPU_STALL_TIMEOUT + int "Expedited RCU CPU stall timeout in milliseconds" + depends on RCU_STALL_COMMON + range 0 300000 + default 0 + help + If a given expedited RCU grace period extends more than the + specified number of milliseconds, a CPU stall warning is printed. + If the RCU grace period persists, additional CPU stall warnings + are printed at more widely spaced intervals. A value of zero + says to use the RCU_CPU_STALL_TIMEOUT value converted from + seconds to milliseconds. + +config RCU_CPU_STALL_CPUTIME + bool "Provide additional RCU stall debug information" + depends on RCU_STALL_COMMON + default n + help + Collect statistics during the sampling period, such as the number of + (hard interrupts, soft interrupts, task switches) and the cputime of + (hard interrupts, soft interrupts, kernel tasks) are added to the + RCU stall report. For multiple continuous RCU stalls, all sampling + periods begin at half of the first RCU stall timeout. + The boot option rcupdate.rcu_cpu_stall_cputime has the same function + as this one, but will override this if it exists. + +config RCU_CPU_STALL_NOTIFIER + bool "Provide RCU CPU-stall notifiers" + depends on RCU_STALL_COMMON + depends on DEBUG_KERNEL + depends on RCU_EXPERT + default n + help + WARNING: You almost certainly do not want this!!! + + Enable RCU CPU-stall notifiers, which are invoked just before + printing the RCU CPU stall warning. As such, bugs in notifier + callbacks can prevent stall warnings from being printed. + And the whole reason that a stall warning is being printed is + that something is hung up somewhere. Therefore, the notifier + callbacks must be written extremely carefully, preferably + containing only lockless code. After all, it is quite possible + that the whole reason that the RCU CPU stall is happening in + the first place is that someone forgot to release whatever lock + that you are thinking of acquiring. In which case, having your + notifier callback acquire that lock will hang, preventing the + RCU CPU stall warning from appearing. + + Say Y here if you want RCU CPU stall notifiers (you don't want them) + Say N if you are unsure. 
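As the RCU_CPU_STALL_NOTIFIER help text insists, a notifier callback runs while something is already hung, so it must stay lockless. A minimal sketch, assuming the rcu_stall_chain_notifier_register() interface (declared in <linux/rcu_notifier.h> in kernels that carry this option):

    #include <linux/init.h>
    #include <linux/notifier.h>
    #include <linux/printk.h>
    #include <linux/rcu_notifier.h>

    /* Deliberately lockless: just report preexisting state and return. */
    static int my_stall_notify(struct notifier_block *nb,
                               unsigned long val, void *v)
    {
        pr_info("RCU stall notifier fired (type %lu)\n", val);
        return NOTIFY_OK;
    }

    static struct notifier_block my_stall_nb = {
        .notifier_call = my_stall_notify,
    };

    static int __init my_stall_debug_init(void)
    {
        return rcu_stall_chain_notifier_register(&my_stall_nb);
    }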
+ config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL @@ -79,4 +198,34 @@ config RCU_EQS_DEBUG Say N here if you need ultimate kernel/user switch latencies Say Y if you are unsure +config RCU_STRICT_GRACE_PERIOD + bool "Provide debug RCU implementation with short grace periods" + depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU + default n + select PREEMPT_COUNT if PREEMPT=n + help + Select this option to build an RCU variant that is strict about + grace periods, making them as short as it can. This limits + scalability, destroys real-time response, degrades battery + lifetime and kills performance. Don't try this on large + machines, as in systems with more than about 10 or 20 CPUs. + But in conjunction with tools like KASAN, it can be helpful + when looking for certain types of RCU usage bugs, for example, + too-short RCU read-side critical sections. + + +config RCU_DYNTICKS_TORTURE + bool "Minimize RCU dynticks counter size" + depends on RCU_EXPERT && !COMPILE_TEST + default n + help + This option sets the width of the dynticks counter to its + minimum usable value. This minimum width greatly increases + the probability of flushing out bugs involving counter wrap, + but it also increases the probability of extending grace period + durations. This Kconfig option should therefore be avoided in + production due to the consequent increased probability of OOMs. + + This has no value for production and is only for testing. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 020e8b6a644b..0cfb009a99b9 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,12 +3,16 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n +ifeq ($(CONFIG_KCSAN),y) +KBUILD_CFLAGS += -g -fno-omit-frame-pointer +endif + obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_SCALE_TEST) += rcuscale.o +obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o -obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index a393e24a9195..9cf01832a6c3 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -1,45 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update definitions shared among RCU implementations. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2011 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. 
McKenney <paulmck@linux.ibm.com> */ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include <linux/slab.h> #include <trace/events/rcu.h> -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - -/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ -#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) - /* * Grace-period counter management. + * + * The two least significant bits contain the control flags. + * The most significant bits contain the grace-period sequence counter. + * + * When both control flags are zero, no grace period is in progress. + * When either bit is non-zero, a grace period has started and is in + * progress. When the grace period completes, the control flags are reset + * to 0 and the grace-period sequence counter is incremented. + * + * However some specific RCU usages make use of custom values. + * + * SRCU special control values: + * + * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node + * is initialized. + * + * SRCU_STATE_IDLE : No SRCU gp is in progress + * + * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates + * we are scanning the readers on the slot + * defined as inactive (there might well + * be pending readers that will use that + * index, but their number is bounded). + * + * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state() + * Indicates we are flipping the readers + * index and then scanning the readers on the + * slot newly designated as inactive (again, + * the number of pending readers that will use + * this inactive index is bounded). + * + * RCU polled GP special control value: + * + * RCU_GET_STATE_COMPLETED : State value indicating an already-completed + * polled GP has completed. This value covers + * both the state and the counter of the + * grace-period sequence number. */ -#define RCU_SEQ_CTR_SHIFT 2 -#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) +/* Low-order bit definition for polled grace-period APIs. */ +#define RCU_GET_STATE_COMPLETED 0x1 + +/* A complete grace period count */ +#define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1) + +extern int sysctl_sched_rt_runtime; /* * Return the counter portion of a sequence number previously returned @@ -136,6 +157,27 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) } /* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred, but do not allow the + * (ULONG_MAX / 2) safety-factor/guard-band. + * + * The token returned by get_state_synchronize_rcu_full() is based on + * rcu_state.gp_seq but it is tested in poll_state_synchronize_rcu_full() + * against the root rnp->gp_seq. Since rcu_seq_start() is first called + * on rcu_state.gp_seq and only later reflected on the root rnp->gp_seq, + * it is possible that rcu_seq_snap(rcu_state.gp_seq) returns 2 full grace + * periods ahead of the root rnp->gp_seq. To prevent false-positives with the + * full polling API that a wrap around instantly completed the GP, when nothing + * like that happened, adjust for the 2 GPs in the ULONG_CMP_LT(). + */ +static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) +{ + unsigned long cur_s = READ_ONCE(*sp); + + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_GP)); +} + +/* * Has a grace period completed since the time the old gp_seq was collected? 
*/ static inline bool rcu_seq_completed_gp(unsigned long old, unsigned long new) @@ -185,7 +227,7 @@ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old) # define STATE_RCU_HEAD_READY 0 # define STATE_RCU_HEAD_QUEUED 1 -extern struct debug_obj_descr rcuhead_debug_descr; +extern const struct debug_obj_descr rcuhead_debug_descr; static inline int debug_rcu_head_queue(struct rcu_head *head) { @@ -216,37 +258,41 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -void kfree(const void *); +static inline void debug_rcu_head_callback(struct rcu_head *rhp) +{ + if (unlikely(!rhp->func)) + kmem_dump_obj(rhp); +} -/* - * Reclaim the specified callback, either by invoking it (non-lazy case) - * or freeing it directly (lazy case). Return true if lazy, false otherwise. - */ -static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) +static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp) { - rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; - - rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) - kfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head);) - f = head->func; - WRITE_ONCE(head->func, (rcu_callback_t)0L); - f(head); - rcu_lock_release(&rcu_callback_map); - return false; - } + return rhp->next == rhp; } +extern int rcu_cpu_stall_suppress_at_boot; + +static inline bool rcu_stall_is_suppressed_at_boot(void) +{ + return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended(); +} + +extern int rcu_cpu_stall_notifiers; + #ifdef CONFIG_RCU_STALL_COMMON +extern int rcu_cpu_stall_ftrace_dump; extern int rcu_cpu_stall_suppress; +extern int rcu_cpu_stall_timeout; +extern int rcu_exp_cpu_stall_timeout; +extern int rcu_cpu_stall_cputime; +extern bool rcu_exp_stall_task_details __read_mostly; int rcu_jiffies_till_stall_check(void); +int rcu_exp_jiffies_till_stall_check(void); + +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot() || rcu_cpu_stall_suppress; +} #define rcu_ftrace_dump_stall_suppress() \ do { \ @@ -261,6 +307,11 @@ do { \ } while (0) #else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */ + +static inline bool rcu_stall_is_suppressed(void) +{ + return rcu_stall_is_suppressed_at_boot(); +} #define rcu_ftrace_dump_stall_suppress() #define rcu_ftrace_dump_stall_unsuppress() #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ @@ -297,7 +348,7 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); -#if defined(SRCU) || !defined(TINY_RCU) +#if !defined(CONFIG_TINY_RCU) #include <linux/rcu_node_tree.h> @@ -315,6 +366,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) { int i; + for (i = 0; i < RCU_NUM_LVLS; i++) + levelspread[i] = INT_MIN; if (rcu_fanout_exact) { levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; for (i = rcu_num_lvls - 2; i >= 0; i--) @@ -332,6 +385,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } +extern void rcu_init_geometry(void); + /* Returns a pointer to the first leaf rcu_node structure. */ #define rcu_first_leaf_node() (rcu_state.level[rcu_num_lvls - 1]) @@ -346,11 +401,13 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * specified state structure (for SRCU) or the only rcu_state structure * (for RCU). 
*/ -#define srcu_for_each_node_breadth_first(sp, rnp) \ +#define _rcu_for_each_node_breadth_first(sp, rnp) \ for ((rnp) = &(sp)->node[0]; \ (rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++) #define rcu_for_each_node_breadth_first(rnp) \ - srcu_for_each_node_breadth_first(&rcu_state, rnp) + _rcu_for_each_node_breadth_first(&rcu_state, rnp) +#define srcu_for_each_node_breadth_first(ssp, rnp) \ + _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp) /* * Scan the leaves of the rcu_node hierarchy for the rcu_state structure. @@ -366,7 +423,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * Iterate over all possible CPUs in a leaf RCU node. */ #define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \ (cpu) <= rnp->grphi; \ (cpu) = cpumask_next((cpu), cpu_possible_mask)) @@ -376,10 +434,15 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) #define rcu_find_next_bit(rnp, cpu, mask) \ ((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu))) #define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \ - for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ + for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \ + (cpu) = rcu_find_next_bit((rnp), 0, (mask)); \ (cpu) <= rnp->grphi; \ (cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask))) +#endif /* !defined(CONFIG_TINY_RCU) */ + +#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC) + /* * Wrappers for the rcu_node::lock acquire and release. * @@ -400,7 +463,11 @@ do { \ smp_mb__after_unlock_lock(); \ } while (0) -#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) +#define raw_spin_unlock_rcu_node(p) \ +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock(&ACCESS_PRIVATE(p, lock)); \ +} while (0) #define raw_spin_lock_irq_rcu_node(p) \ do { \ @@ -409,7 +476,10 @@ do { \ } while (0) #define raw_spin_unlock_irq_rcu_node(p) \ - raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)); \ +} while (0) #define raw_spin_lock_irqsave_rcu_node(p, flags) \ do { \ @@ -418,7 +488,10 @@ do { \ } while (0) #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags); \ +} while (0) #define raw_spin_trylock_rcu_node(p) \ ({ \ @@ -432,60 +505,93 @@ do { \ #define raw_lockdep_assert_held_rcu_node(p) \ lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) -#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ - -#ifdef CONFIG_SRCU -void srcu_init(void); -#else /* #ifdef CONFIG_SRCU */ -static inline void srcu_init(void) { } -#endif /* #else #ifdef CONFIG_SRCU */ +#endif // #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_TASKS_RCU_GENERIC) #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. 
*/ static inline bool rcu_gp_is_normal(void) { return true; } static inline bool rcu_gp_is_expedited(void) { return false; } +static inline bool rcu_async_should_hurry(void) { return false; } static inline void rcu_expedite_gp(void) { } static inline void rcu_unexpedite_gp(void) { } -static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } +static inline void rcu_async_hurry(void) { } +static inline void rcu_async_relax(void) { } +static inline bool rcu_cpu_online(int cpu) { return true; } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ +bool rcu_async_should_hurry(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); +void rcu_async_hurry(void); +void rcu_async_relax(void); void rcupdate_announce_bootup_oddness(void); -void rcu_request_urgent_qs_task(struct task_struct *t); +bool rcu_cpu_online(int cpu); +#ifdef CONFIG_TASKS_RCU_GENERIC +void show_rcu_tasks_gp_kthreads(void); +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void show_rcu_tasks_gp_kthreads(void) {} +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TASKS_RCU +struct task_struct *get_rcu_tasks_gp_kthread(void); +void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq); +#endif // # ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_RUDE_RCU +struct task_struct *get_rcu_tasks_rude_gp_kthread(void); +void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq); +#endif // # ifdef CONFIG_TASKS_RUDE_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU +void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); +#endif + +#ifdef CONFIG_TASKS_RCU_GENERIC +void tasks_cblist_init_generic(void); +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void tasks_cblist_init_generic(void) { } +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ + #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 enum rcutorture_type { RCU_FLAVOR, - RCU_BH_FLAVOR, - RCU_SCHED_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TASKS_RUDE_FLAVOR, + RCU_TASKS_TRACING_FLAVOR, + RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR }; -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gp_seq); -void rcutorture_record_progress(unsigned long vernum); +#if defined(CONFIG_RCU_LAZY) +unsigned long rcu_get_jiffies_lazy_flush(void); +void rcu_set_jiffies_lazy_flush(unsigned long j); +#else +static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; } +static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { } +#endif + +#if defined(CONFIG_TREE_RCU) +void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c); +void rcu_gp_set_torture_wait(int duration); +void rcu_set_gpwrap_lag(unsigned long lag); +int rcu_get_gpwrap_count(int cpu); #else -static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, unsigned long *gp_seq) +static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { *flags = 0; *gp_seq = 0; } -static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -496,55 +602,93 @@ void 
do_trace_rcu_torture_read(const char *rcutorturename, #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ do { } while (0) #endif +static inline void rcu_gp_set_torture_wait(int duration) { } +static inline void rcu_set_gpwrap_lag(unsigned long lag) { } +static inline int rcu_get_gpwrap_count(int cpu) { return 0; } #endif +unsigned long long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); #ifdef CONFIG_TINY_SRCU -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, +static inline void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, unsigned long *gp_seq) { - if (test_type != SRCU_FLAVOR) - return; *flags = 0; *gp_seq = sp->srcu_idx; } #elif defined(CONFIG_TREE_SRCU) -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, +void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, unsigned long *gp_seq); #endif #ifdef CONFIG_TINY_RCU +static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long -srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } +static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } static inline int rcu_get_gp_kthreads_prio(void) { return 0; } static inline void rcu_fwd_progress_check(unsigned long j) { } +static inline void rcu_gp_slow_register(atomic_t *rgssp) { } +static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long srcu_batches_completed(struct srcu_struct *sp); +bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; -extern struct workqueue_struct *rcu_par_gp_wq; +extern struct kthread_worker *rcu_exp_gp_kworker; +void rcu_gp_slow_register(atomic_t *rgssp); +void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TINY_SRCU +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +#else // #ifdef CONFIG_TINY_SRCU +unsigned long srcu_batches_completed(struct srcu_struct *sp); +#endif // #else // #ifdef CONFIG_TINY_SRCU + #ifdef CONFIG_RCU_NOCB_CPU -bool rcu_is_nocb_cpu(int cpu); void rcu_bind_current_to_nocb(void); #else -static inline bool rcu_is_nocb_cpu(int cpu) { return false; } static inline void rcu_bind_current_to_nocb(void) { } #endif +#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RCU) +void show_rcu_tasks_classic_gp_kthread(void); +#else +static inline void show_rcu_tasks_classic_gp_kthread(void) {} +#endif +#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RUDE_RCU) +void show_rcu_tasks_rude_gp_kthread(void); +#else +static inline void show_rcu_tasks_rude_gp_kthread(void) {} +#endif +#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU) +void show_rcu_tasks_trace_gp_kthread(void); +#else +static inline void 
show_rcu_tasks_trace_gp_kthread(void) {} +#endif + +#ifdef CONFIG_TINY_RCU +static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } +#else +bool rcu_cpu_beenfullyonline(int cpu); +#endif + +#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) +int rcu_stall_notifier_call_chain(unsigned long val, void *v); +#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) +static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; } +#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5aff271adf1e..298a2c573f02 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -1,29 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU segmented callback lists, function definitions * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ -#include <linux/types.h> -#include <linux/kernel.h> +#include <linux/cpu.h> #include <linux/interrupt.h> -#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <linux/types.h> #include "rcu_segcblist.h" @@ -33,15 +20,49 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->head = NULL; rclp->tail = &rclp->head; rclp->len = 0; - rclp->len_lazy = 0; +} + +/* + * Enqueue an rcu_head structure onto the specified callback list. + */ +void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) +{ + *rclp->tail = rhp; + rclp->tail = &rhp->next; + WRITE_ONCE(rclp->len, rclp->len + 1); +} + +/* + * Flush the second rcu_cblist structure onto the first one, obliterating + * any contents of the first. If rhp is non-NULL, enqueue it as the sole + * element of the second rcu_cblist structure, but ensuring that the second + * rcu_cblist structure, if initially non-empty, always appears non-empty + * throughout the process. If rdp is NULL, the second rcu_cblist structure + * is instead initialized to empty. + */ +void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, + struct rcu_cblist *srclp, + struct rcu_head *rhp) +{ + drclp->head = srclp->head; + if (drclp->head) + drclp->tail = srclp->tail; + else + drclp->tail = &drclp->head; + drclp->len = srclp->len; + if (!rhp) { + rcu_cblist_init(srclp); + } else { + rhp->next = NULL; + srclp->head = rhp; + srclp->tail = &rhp->next; + WRITE_ONCE(srclp->len, 1); + } } /* * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). 
This allows - * different users to have different ways of determining laziness. + * list. */ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) { @@ -57,6 +78,159 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) return rhp; } +/* Set the length of an rcu_segcblist structure. */ +static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU + atomic_long_set(&rsclp->len, v); +#else + WRITE_ONCE(rsclp->len, v); +#endif +} + +/* Get the length of a segment of the rcu_segcblist structure. */ +long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) +{ + return READ_ONCE(rsclp->seglen[seg]); +} + +/* Return number of callbacks in segmented callback list by summing seglen. */ +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp) +{ + long len = 0; + int i; + + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + len += rcu_segcblist_get_seglen(rsclp, i); + + return len; +} + +/* Set the length of a segment of the rcu_segcblist structure. */ +static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], v); +} + +/* Increase the numeric length of a segment by a specified amount. */ +static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v); +} + +/* Move from's segment length to to's segment. */ +static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to) +{ + long len; + + if (from == to) + return; + + len = rcu_segcblist_get_seglen(rsclp, from); + if (!len) + return; + + rcu_segcblist_add_seglen(rsclp, to, len); + rcu_segcblist_set_seglen(rsclp, from, 0); +} + +/* Increment segment's length. */ +static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg) +{ + rcu_segcblist_add_seglen(rsclp, seg, 1); +} + +/* + * Increase the numeric length of an rcu_segcblist structure by the + * specified amount, which can be negative. This can cause the ->len + * field to disagree with the actual number of callbacks on the structure. + * This increase is fully ordered with respect to the callers accesses + * both before and after. + * + * So why on earth is a memory barrier required both before and after + * the update to the ->len field??? + * + * The reason is that rcu_barrier() locklessly samples each CPU's ->len + * field, and if a given CPU's field is zero, avoids IPIing that CPU. + * This can of course race with both queuing and invoking of callbacks. + * Failing to correctly handle either of these races could result in + * rcu_barrier() failing to IPI a CPU that actually had callbacks queued + * which rcu_barrier() was obligated to wait on. And if rcu_barrier() + * failed to wait on such a callback, unloading certain kernel modules + * would result in calls to functions whose code was no longer present in + * the kernel, for but one example. + * + * Therefore, ->len transitions from 1->0 and 0->1 have to be carefully + * ordered with respect with both list modifications and the rcu_barrier(). + * + * The queuing case is CASE 1 and the invoking case is CASE 2. + * + * CASE 1: Suppose that CPU 0 has no callbacks queued, but invokes + * call_rcu() just as CPU 1 invokes rcu_barrier(). CPU 0's ->len field + * will transition from 0->1, which is one of the transitions that must + * be handled carefully. 
Without the full memory barriers after the ->len + * update and at the beginning of rcu_barrier(), the following could happen: + * + * CPU 0 CPU 1 + * + * call_rcu(). + * rcu_barrier() sees ->len as 0. + * set ->len = 1. + * rcu_barrier() does nothing. + * module is unloaded. + * callback invokes unloaded function! + * + * With the full barriers, any case where rcu_barrier() sees ->len as 0 will + * have unambiguously preceded the return from the racing call_rcu(), which + * means that this call_rcu() invocation is OK to not wait on. After all, + * you are supposed to make sure that any problematic call_rcu() invocations + * happen before the rcu_barrier(). + * + * + * CASE 2: Suppose that CPU 0 is invoking its last callback just as + * CPU 1 invokes rcu_barrier(). CPU 0's ->len field will transition from + * 1->0, which is one of the transitions that must be handled carefully. + * Without the full memory barriers before the ->len update and at the + * end of rcu_barrier(), the following could happen: + * + * CPU 0 CPU 1 + * + * start invoking last callback + * set ->len = 0 (reordered) + * rcu_barrier() sees ->len as 0 + * rcu_barrier() does nothing. + * module is unloaded + * callback executing after unloaded! + * + * With the full barriers, any case where rcu_barrier() sees ->len as 0 + * will be fully ordered after the completion of the callback function, + * so that the module unloading operation is completely safe. + * + */ +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) +{ +#ifdef CONFIG_RCU_NOCB_CPU + smp_mb__before_atomic(); // Read header comment above. + atomic_long_add(v, &rsclp->len); + smp_mb__after_atomic(); // Read header comment above. +#else + smp_mb(); // Read header comment above. + WRITE_ONCE(rsclp->len, rsclp->len + v); + smp_mb(); // Read header comment above. +#endif +} + +/* + * Increase the numeric length of an rcu_segcblist structure by one. + * This can cause the ->len field to disagree with the actual number of + * callbacks on the structure. This increase is fully ordered with respect + * to the callers accesses both before and after. + */ +void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) +{ + rcu_segcblist_add_len(rsclp, 1); +} + /* * Initialize an rcu_segcblist structure. 
*/ @@ -67,10 +241,12 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) + for (i = 0; i < RCU_CBLIST_NSEGS; i++) { rsclp->tails[i] = &rsclp->head; - rsclp->len = 0; - rsclp->len_lazy = 0; + rcu_segcblist_set_seglen(rsclp, i, 0); + } + rcu_segcblist_set_len(rsclp, 0); + rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED); } /* @@ -81,8 +257,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); - rsclp->tails[RCU_NEXT_TAIL] = NULL; + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED); } /* @@ -92,7 +267,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) { return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; + &rsclp->head != READ_ONCE(rsclp->tails[RCU_DONE_TAIL]); } /* @@ -131,6 +306,18 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) } /* + * Return false if there are no CBs awaiting grace periods, otherwise, + * return true and store the nearest waited-upon grace period into *lp. + */ +bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) +{ + if (!rcu_segcblist_pend_cbs(rsclp)) + return false; + *lp = rsclp->gp_seq[RCU_WAIT_TAIL]; + return true; +} + +/* * Enqueue the specified callback onto the specified rcu_segcblist * structure, updating accounting as needed. Note that the ->len * field may be accessed locklessly, hence the WRITE_ONCE(). @@ -140,15 +327,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) * absolutely not OK for it to ever miss posting a callback. */ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { - WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ - if (lazy) - rsclp->len_lazy++; - smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rcu_segcblist_inc_len(rsclp); + rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL); rhp->next = NULL; - *rsclp->tails[RCU_NEXT_TAIL] = rhp; - rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; + WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); + WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next); } /* @@ -162,45 +347,26 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, * period. You have been warned. */ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { int i; if (rcu_segcblist_n_cbs(rsclp) == 0) return false; - WRITE_ONCE(rsclp->len, rsclp->len + 1); - if (lazy) - rsclp->len_lazy++; + rcu_segcblist_inc_len(rsclp); smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1]) + if (!rcu_segcblist_segempty(rsclp, i)) break; - *rsclp->tails[i] = rhp; + rcu_segcblist_inc_seglen(rsclp, i); + WRITE_ONCE(*rsclp->tails[i], rhp); for (; i <= RCU_NEXT_TAIL; i++) - rsclp->tails[i] = &rhp->next; + WRITE_ONCE(rsclp->tails[i], &rhp->next); return true; } /* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. 
This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len_lazy += rsclp->len_lazy; - rclp->len += rsclp->len; - rsclp->len_lazy = 0; - WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ -} - -/* * Extract only those callbacks ready to be invoked from the specified * rcu_segcblist structure and place them in the specified rcu_cblist * structure. @@ -212,13 +378,15 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_ready_cbs(rsclp)) return; /* Nothing to do. */ + rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL); *rclp->tail = rsclp->head; - rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; + WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]); + WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); rclp->tail = rsclp->tails[RCU_DONE_TAIL]; for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) - rsclp->tails[i] = &rsclp->head; + WRITE_ONCE(rsclp->tails[i], &rsclp->head); + rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0); } /* @@ -235,11 +403,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_pend_cbs(rsclp)) return; /* Nothing to do. */ + rclp->len = 0; *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; + WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) { + rclp->len += rcu_segcblist_get_seglen(rsclp, i); + WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); + rcu_segcblist_set_seglen(rsclp, i, 0); + } } /* @@ -249,11 +421,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rsclp->len_lazy += rclp->len_lazy; - /* ->len sampled locklessly. */ - WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); - rclp->len_lazy = 0; - rclp->len = 0; + rcu_segcblist_add_len(rsclp, rclp->len); } /* @@ -267,11 +435,12 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, if (!rclp->head) return; /* No callbacks to move. */ + rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len); *rclp->tail = rsclp->head; - rsclp->head = rclp->head; + WRITE_ONCE(rsclp->head, rclp->head); for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) if (&rsclp->head == rsclp->tails[i]) - rsclp->tails[i] = rclp->tail; + WRITE_ONCE(rsclp->tails[i], rclp->tail); else break; rclp->head = NULL; @@ -287,10 +456,10 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, { if (!rclp->head) return; /* Nothing to do. 
*/ - *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; - rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; - rclp->head = NULL; - rclp->tail = &rclp->head; + + rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len); + WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); + WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); } /* @@ -312,7 +481,8 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) break; - rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL); } /* If no callbacks moved, nothing more need be done. */ @@ -321,18 +491,19 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) /* Clean up tail pointers that might have been misordered above. */ for (j = RCU_WAIT_TAIL; j < i; j++) - rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. + * Callbacks moved, so there might be an empty RCU_WAIT_TAIL + * and a non-empty RCU_NEXT_READY_TAIL. If so, copy the + * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap + * created by the now-ready-to-invoke segments. */ for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) break; /* No more callbacks. */ - rsclp->tails[j] = rsclp->tails[i]; + WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, j); rsclp->gp_seq[j] = rsclp->gp_seq[i]; } } @@ -354,7 +525,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) */ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) { - int i; + int i, j; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) @@ -369,7 +540,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * as their ->gp_seq[] grace-period completion sequence number. */ for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && + if (!rcu_segcblist_segempty(rsclp, i) && ULONG_CMP_LT(rsclp->gp_seq[i], seq)) break; @@ -385,10 +556,22 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * Also advance to the oldest segment of callbacks whose * ->gp_seq[] completion is at or after that passed in via "seq", * skipping any empty segments. + * + * Note that segment "i" (and any lower-numbered segments + * containing older callbacks) will be unaffected, and their + * grace-period numbers remain unchanged. For example, if i == + * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched. + * Instead, the CBs in NEXT_TAIL will be merged with those in + * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL + * would be updated. NEXT_TAIL would then be empty. */ - if (++i >= RCU_NEXT_TAIL) + if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; + /* Accounting: everything below i is about to get merged into i. 
*/ + for (j = i + 1; j <= RCU_NEXT_TAIL; j++) + rcu_segcblist_move_seglen(rsclp, j, i); + /* * Merge all later callbacks, including newly arrived callbacks, * into the segment located by the for-loop above. Assign "seq" * as the ->gp_seq[] value in order to correctly handle the case * where there were no pending callbacks in the rcu_segcblist * structure other than in the RCU_NEXT_TAIL segment. */ for (; i < RCU_NEXT_TAIL; i++) { - rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]); rsclp->gp_seq[i] = seq; } return true; @@ -416,13 +599,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_cblist donecbs; struct rcu_cblist pendcbs; + lockdep_assert_cpus_held(); + rcu_cblist_init(&donecbs); rcu_cblist_init(&pendcbs); - rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + + /* + * No need for an smp_mb() before setting the length to 0, because the + * CPU hotplug lock excludes rcu_barrier(). + */ + rcu_segcblist_set_len(src_rsclp, 0); + + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_count(dst_rsclp, &pendcbs); rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 948470cef385..fadc08ad4b7b 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -1,37 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU segmented callback lists, internal-to-rcu header file * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/rcu_segcblist.h> -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +/* Return number of callbacks in the specified callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) { - rclp->len_lazy--; + return READ_ONCE(rclp->len); } +long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg); + +/* Return number of callbacks in segmented callback list by summing seglen.
*/ +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp); + void rcu_cblist_init(struct rcu_cblist *rclp); +void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); +void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, + struct rcu_cblist *srclp, + struct rcu_head *rhp); struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); /* @@ -49,79 +42,93 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); */ static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) { - return !rsclp->head; + return !READ_ONCE(rsclp->head); } /* Return number of callbacks in segmented callback list. */ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) { +#ifdef CONFIG_RCU_NOCB_CPU + return atomic_long_read(&rsclp->len); +#else return READ_ONCE(rsclp->len); +#endif +} + +static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp, + int flags) +{ + WRITE_ONCE(rsclp->flags, rsclp->flags | flags); } -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp, + int flags) { - return rsclp->len_lazy; + WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags); } -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp, + int flags) { - return rsclp->len - rsclp->len_lazy; + return READ_ONCE(rsclp->flags) & flags; } /* * Is the specified rcu_segcblist enabled, for example, not corresponding - * to an offline or callback-offloaded CPU? + * to an offline CPU? */ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) { - return !!rsclp->tails[RCU_NEXT_TAIL]; + return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } /* - * Are all segments following the specified segment of the specified - * rcu_segcblist structure empty of callbacks? (The specified - * segment might well contain callbacks.) + * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the + * [de]offloading process)? */ -static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return !*rsclp->tails[seg]; + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED)) + return true; + + return false; } /* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) { - return rsclp->head; + return !READ_ONCE(*READ_ONCE(rsclp->tails[seg])); } /* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? 
*/ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) { - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; } +void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); +bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp); + struct rcu_head *rhp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c deleted file mode 100644 index b459da70b4fc..000000000000 --- a/kernel/rcu/rcuperf.c +++ /dev/null @@ -1,696 +0,0 @@ -/* - * Read-Copy Update module-based performance-test facility - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (C) IBM Corporation, 2015 - * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> - */ - -#define pr_fmt(fmt) fmt - -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/kthread.h> -#include <linux/err.h> -#include <linux/spinlock.h> -#include <linux/smp.h> -#include <linux/rcupdate.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <uapi/linux/sched/types.h> -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/completion.h> -#include <linux/moduleparam.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/reboot.h> -#include <linux/freezer.h> -#include <linux/cpu.h> -#include <linux/delay.h> -#include <linux/stat.h> -#include <linux/srcu.h> -#include <linux/slab.h> -#include <asm/byteorder.h> -#include <linux/torture.h> -#include <linux/vmalloc.h> - -#include "rcu.h" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. 
McKenney <paulmck@linux.vnet.ibm.com>"); - -#define PERF_FLAG "-perf:" -#define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) -#define VERBOSE_PERFOUT_STRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) -#define VERBOSE_PERFOUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) - -/* - * The intended use cases for the nreaders and nwriters module parameters - * are as follows: - * - * 1. Specify only the nr_cpus kernel boot parameter. This will - * set both nreaders and nwriters to the value specified by - * nr_cpus for a mixed reader/writer test. - * - * 2. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nreaders to zero. This will set nwriters to the - * value specified by nr_cpus for an update-only test. - * - * 3. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nwriters to zero. This will set nreaders to the - * value specified by nr_cpus for a read-only test. - * - * Various other use cases may of course be specified. - */ - -torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); -torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); -torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); -torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, -1, "Number of RCU reader threads"); -torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, !IS_ENABLED(MODULE), - "Shutdown at end of performance tests."); -torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); -torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); - -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); - -static int nrealreaders; -static int nrealwriters; -static struct task_struct **writer_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *shutdown_task; - -static u64 **writer_durations; -static int *writer_n_durations; -static atomic_t n_rcu_perf_reader_started; -static atomic_t n_rcu_perf_writer_started; -static atomic_t n_rcu_perf_writer_finished; -static wait_queue_head_t shutdown_wq; -static u64 t_rcu_perf_writer_started; -static u64 t_rcu_perf_writer_finished; -static unsigned long b_rcu_perf_writer_started; -static unsigned long b_rcu_perf_writer_finished; -static DEFINE_PER_CPU(atomic_t, n_async_inflight); - -static int rcu_perf_writer_state; -#define RTWS_INIT 0 -#define RTWS_ASYNC 1 -#define RTWS_BARRIER 2 -#define RTWS_EXP_SYNC 3 -#define RTWS_SYNC 4 -#define RTWS_IDLE 5 -#define RTWS_STOPPING 6 - -#define MAX_MEAS 10000 -#define MIN_MEAS 100 - -/* - * Operations vector for selecting different types of tests. - */ - -struct rcu_perf_ops { - int ptype; - void (*init)(void); - void (*cleanup)(void); - int (*readlock)(void); - void (*readunlock)(int idx); - unsigned long (*get_gp_seq)(void); - unsigned long (*gp_diff)(unsigned long new, unsigned long old); - unsigned long (*exp_completed)(void); - void (*async)(struct rcu_head *head, rcu_callback_t func); - void (*gp_barrier)(void); - void (*sync)(void); - void (*exp_sync)(void); - const char *name; -}; - -static struct rcu_perf_ops *cur_ops; - -/* - * Definitions for rcu perf testing. 
- */ - -static int rcu_perf_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_perf_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static unsigned long __maybe_unused rcu_no_completed(void) -{ - return 0; -} - -static void rcu_sync_perf_init(void) -{ -} - -static struct rcu_perf_ops rcu_ops = { - .ptype = RCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = rcu_perf_read_lock, - .readunlock = rcu_perf_read_unlock, - .get_gp_seq = rcu_get_gp_seq, - .gp_diff = rcu_seq_diff, - .exp_completed = rcu_exp_batches_completed, - .async = call_rcu, - .gp_barrier = rcu_barrier, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .name = "rcu" -}; - -/* - * Definitions for srcu perf testing. - */ - -DEFINE_STATIC_SRCU(srcu_ctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; - -static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) -{ - return srcu_read_lock(srcu_ctlp); -} - -static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) -{ - srcu_read_unlock(srcu_ctlp, idx); -} - -static unsigned long srcu_perf_completed(void) -{ - return srcu_batches_completed(srcu_ctlp); -} - -static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) -{ - call_srcu(srcu_ctlp, head, func); -} - -static void srcu_rcu_barrier(void) -{ - srcu_barrier(srcu_ctlp); -} - -static void srcu_perf_synchronize(void) -{ - synchronize_srcu(srcu_ctlp); -} - -static void srcu_perf_synchronize_expedited(void) -{ - synchronize_srcu_expedited(srcu_ctlp); -} - -static struct rcu_perf_ops srcu_ops = { - .ptype = SRCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, - .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, - .async = srcu_call_rcu, - .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, - .name = "srcu" -}; - -static struct srcu_struct srcud; - -static void srcu_sync_perf_init(void) -{ - srcu_ctlp = &srcud; - init_srcu_struct(srcu_ctlp); -} - -static void srcu_sync_perf_cleanup(void) -{ - cleanup_srcu_struct(srcu_ctlp); -} - -static struct rcu_perf_ops srcud_ops = { - .ptype = SRCU_FLAVOR, - .init = srcu_sync_perf_init, - .cleanup = srcu_sync_perf_cleanup, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, - .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, - .async = srcu_call_rcu, - .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, - .name = "srcud" -}; - -/* - * Definitions for RCU-tasks perf testing. - */ - -static int tasks_perf_read_lock(void) -{ - return 0; -} - -static void tasks_perf_read_unlock(int idx) -{ -} - -static struct rcu_perf_ops tasks_ops = { - .ptype = RCU_TASKS_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = tasks_perf_read_lock, - .readunlock = tasks_perf_read_unlock, - .get_gp_seq = rcu_no_completed, - .gp_diff = rcu_seq_diff, - .async = call_rcu_tasks, - .gp_barrier = rcu_barrier_tasks, - .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, - .name = "tasks" -}; - -static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) -{ - if (!cur_ops->gp_diff) - return new - old; - return cur_ops->gp_diff(new, old); -} - -/* - * If performance tests complete, wait for shutdown to commence. 
- */ -static void rcu_perf_wait_shutdown(void) -{ - cond_resched_tasks_rcu_qs(); - if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) - return; - while (!torture_must_stop()) - schedule_timeout_uninterruptible(1); -} - -/* - * RCU perf reader kthread. Repeatedly does empty RCU read-side - * critical section, minimizing update-side interference. - */ -static int -rcu_perf_reader(void *arg) -{ - unsigned long flags; - int idx; - long me = (long)arg; - - VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - set_user_nice(current, MAX_NICE); - atomic_inc(&n_rcu_perf_reader_started); - - do { - local_irq_save(flags); - idx = cur_ops->readlock(); - cur_ops->readunlock(idx); - local_irq_restore(flags); - rcu_perf_wait_shutdown(); - } while (!torture_must_stop()); - torture_kthread_stopping("rcu_perf_reader"); - return 0; -} - -/* - * Callback function for asynchronous grace periods from rcu_perf_writer(). - */ -static void rcu_perf_async_cb(struct rcu_head *rhp) -{ - atomic_dec(this_cpu_ptr(&n_async_inflight)); - kfree(rhp); -} - -/* - * RCU perf writer kthread. Repeatedly does a grace period. - */ -static int -rcu_perf_writer(void *arg) -{ - int i = 0; - int i_max; - long me = (long)arg; - struct rcu_head *rhp = NULL; - struct sched_param sp; - bool started = false, done = false, alldone = false; - u64 t; - u64 *wdp; - u64 *wdpp = writer_durations[me]; - - VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); - WARN_ON(!wdpp); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - sp.sched_priority = 1; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); - - if (holdoff) - schedule_timeout_uninterruptible(holdoff * HZ); - - t = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { - t_rcu_perf_writer_started = t; - if (gp_exp) { - b_rcu_perf_writer_started = - cur_ops->exp_completed() / 2; - } else { - b_rcu_perf_writer_started = cur_ops->get_gp_seq(); - } - } - - do { - if (writer_holdoff) - udelay(writer_holdoff); - wdp = &wdpp[i]; - *wdp = ktime_get_mono_fast_ns(); - if (gp_async) { -retry: - if (!rhp) - rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); - if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - rcu_perf_writer_state = RTWS_ASYNC; - atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_perf_async_cb); - rhp = NULL; - } else if (!kthread_should_stop()) { - rcu_perf_writer_state = RTWS_BARRIER; - cur_ops->gp_barrier(); - goto retry; - } else { - kfree(rhp); /* Because we are stopping. 
*/ - } - } else if (gp_exp) { - rcu_perf_writer_state = RTWS_EXP_SYNC; - cur_ops->exp_sync(); - } else { - rcu_perf_writer_state = RTWS_SYNC; - cur_ops->sync(); - } - rcu_perf_writer_state = RTWS_IDLE; - t = ktime_get_mono_fast_ns(); - *wdp = t - *wdp; - i_max = i; - if (!started && - atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) - started = true; - if (!done && i >= MIN_MEAS) { - done = true; - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, - SCHED_NORMAL, &sp); - pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", - perf_type, PERF_FLAG, me, MIN_MEAS); - if (atomic_inc_return(&n_rcu_perf_writer_finished) >= - nrealwriters) { - schedule_timeout_interruptible(10); - rcu_ftrace_dump(DUMP_ALL); - PERFOUT_STRING("Test complete"); - t_rcu_perf_writer_finished = t; - if (gp_exp) { - b_rcu_perf_writer_finished = - cur_ops->exp_completed() / 2; - } else { - b_rcu_perf_writer_finished = - cur_ops->get_gp_seq(); - } - if (shutdown) { - smp_mb(); /* Assign before wake. */ - wake_up(&shutdown_wq); - } - } - } - if (done && !alldone && - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) - alldone = true; - if (started && !alldone && i < MAX_MEAS - 1) - i++; - rcu_perf_wait_shutdown(); - } while (!torture_must_stop()); - if (gp_async) { - rcu_perf_writer_state = RTWS_BARRIER; - cur_ops->gp_barrier(); - } - rcu_perf_writer_state = RTWS_STOPPING; - writer_n_durations[me] = i_max; - torture_kthread_stopping("rcu_perf_writer"); - return 0; -} - -static void -rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) -{ - pr_alert("%s" PERF_FLAG - "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); -} - -static void -rcu_perf_cleanup(void) -{ - int i; - int j; - int ngps = 0; - u64 *wdp; - u64 *wdpp; - - /* - * Would like warning at start, but everything is expedited - * during the mid-boot phase, so have to wait till the end. 
- */ - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - if (rcu_gp_is_normal() && gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - if (gp_exp && gp_async) - VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); - - if (torture_cleanup_begin()) - return; - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_perf_reader, - reader_tasks[i]); - kfree(reader_tasks); - } - - if (writer_tasks) { - for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_perf_writer, - writer_tasks[i]); - if (!writer_n_durations) - continue; - j = writer_n_durations[i]; - pr_alert("%s%s writer %d gps: %d\n", - perf_type, PERF_FLAG, i, j); - ngps += j; - } - pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - perf_type, PERF_FLAG, - t_rcu_perf_writer_started, t_rcu_perf_writer_finished, - t_rcu_perf_writer_finished - - t_rcu_perf_writer_started, - ngps, - rcuperf_seq_diff(b_rcu_perf_writer_finished, - b_rcu_perf_writer_started)); - for (i = 0; i < nrealwriters; i++) { - if (!writer_durations) - break; - if (!writer_n_durations) - continue; - wdpp = writer_durations[i]; - if (!wdpp) - continue; - for (j = 0; j <= writer_n_durations[i]; j++) { - wdp = &wdpp[j]; - pr_alert("%s%s %4d writer-duration: %5d %llu\n", - perf_type, PERF_FLAG, - i, j, *wdp); - if (j % 100 == 0) - schedule_timeout_uninterruptible(1); - } - kfree(writer_durations[i]); - } - kfree(writer_tasks); - kfree(writer_durations); - kfree(writer_n_durations); - } - - /* Do torture-type-specific cleanup operations. */ - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); -} - -/* - * Return the number if non-negative. If -1, the number of CPUs. - * If less than -1, that much less than the number of CPUs, but - * at least one. - */ -static int compute_real(int n) -{ - int nr; - - if (n >= 0) { - nr = n; - } else { - nr = num_online_cpus() + 1 + n; - if (nr <= 0) - nr = 1; - } - return nr; -} - -/* - * RCU perf shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_perf_shutdown(void *arg) -{ - do { - wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= - nrealwriters); - } while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters); - smp_mb(); /* Wake before output. */ - rcu_perf_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -static int __init -rcu_perf_init(void) -{ - long i; - int firsterr = 0; - static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, - }; - - if (!torture_init_begin(perf_type, verbose)) - return -EBUSY; - - /* Process args and tell the world that the perf'er is on the job. 
*/ - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); - pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); - firsterr = -EINVAL; - goto unwind; - } - if (cur_ops->init) - cur_ops->init(); - - nrealwriters = compute_real(nwriters); - nrealreaders = compute_real(nreaders); - atomic_set(&n_rcu_perf_reader_started, 0); - atomic_set(&n_rcu_perf_writer_started, 0); - atomic_set(&n_rcu_perf_writer_finished, 0); - rcu_perf_print_module_parms(cur_ops, "Start of test"); - - /* Start up the kthreads. */ - - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, - shutdown_task); - if (firsterr) - goto unwind; - schedule_timeout_uninterruptible(1); - } - reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, - reader_tasks[i]); - if (firsterr) - goto unwind; - } - while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) - schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), - GFP_KERNEL); - writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), - GFP_KERNEL); - writer_n_durations = - kcalloc(nrealwriters, sizeof(*writer_n_durations), - GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealwriters; i++) { - writer_durations[i] = - kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), - GFP_KERNEL); - if (!writer_durations[i]) { - firsterr = -ENOMEM; - goto unwind; - } - firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, - writer_tasks[i]); - if (firsterr) - goto unwind; - } - torture_init_end(); - return 0; - -unwind: - torture_init_end(); - rcu_perf_cleanup(); - return firsterr; -} - -module_init(rcu_perf_init); -module_exit(rcu_perf_cleanup); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c new file mode 100644 index 000000000000..7484d8ad5767 --- /dev/null +++ b/kernel/rcu/rcuscale.c @@ -0,0 +1,1219 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Read-Copy Update module-based scalability-test facility + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. 
McKenney <paulmck@linux.ibm.com> + */ + +#define pr_fmt(fmt) fmt + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <uapi/linux/sched/types.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <asm/byteorder.h> +#include <linux/torture.h> +#include <linux/vmalloc.h> +#include <linux/rcupdate_trace.h> +#include <linux/sched/debug.h> + +#include "rcu.h" + +MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); + +#define SCALE_FLAG "-scale:" +#define SCALEOUT_STRING(s) \ + pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s) +#define VERBOSE_SCALEOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0) +#define SCALEOUT_ERRSTRING(s) \ + pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s) + +/* + * The intended use cases for the nreaders and nwriters module parameters + * are as follows: + * + * 1. Specify only the nr_cpus kernel boot parameter. This will + * set both nreaders and nwriters to the value specified by + * nr_cpus for a mixed reader/writer test. + * + * 2. Specify the nr_cpus kernel boot parameter, but set + * rcuscale.nreaders to zero. This will set nwriters to the + * value specified by nr_cpus for an update-only test. + * + * 3. Specify the nr_cpus kernel boot parameter, but set + * rcuscale.nwriters to zero. This will set nreaders to the + * value specified by nr_cpus for a read-only test. + * + * Various other use cases may of course be specified. + * + * Note that this test's readers are intended only as a test load for + * the writers. The reader scalability statistics will be overly + * pessimistic due to the per-critical-section interrupt disabling, + * test-end checks, and the pair of calls through pointers. 
+ */ + +#ifdef MODULE +# define RCUSCALE_SHUTDOWN 0 +#else +# define RCUSCALE_SHUTDOWN 1 +#endif + +torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, minruntime, 0, "Minimum run time (s)"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); +torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); +torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?"); + +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); + +// Structure definitions for custom fixed-per-task allocator. +struct writer_mblock { + struct rcu_head wmb_rh; + struct llist_node wmb_node; + struct writer_freelist *wmb_wfl; +}; + +struct writer_freelist { + struct llist_head ws_lhg; + atomic_t ws_inflight; + struct llist_head ____cacheline_internodealigned_in_smp ws_lhp; + struct writer_mblock *ws_mblocks; +}; + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static bool *writer_done; +static struct writer_freelist *writer_freelists; +static int *writer_n_durations; +static atomic_t n_rcu_scale_reader_started; +static atomic_t n_rcu_scale_writer_started; +static atomic_t n_rcu_scale_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_scale_writer_started; +static u64 t_rcu_scale_writer_finished; +static unsigned long b_rcu_gp_test_started; +static unsigned long b_rcu_gp_test_finished; + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_scale_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); + unsigned long (*exp_completed)(void); + void (*async)(struct rcu_head *head, rcu_callback_t func); + void (*gp_barrier)(void); + void (*sync)(void); + void (*exp_sync)(void); + struct task_struct *(*rso_gp_kthread)(void); + void (*stats)(void); + const char *name; +}; + +static struct rcu_scale_ops *cur_ops; + +/* + * Definitions for rcu scalability testing. 
+ */ + +static int rcu_scale_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_scale_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_scale_init(void) +{ +} + +static struct rcu_scale_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = rcu_scale_read_lock, + .readunlock = rcu_scale_read_unlock, + .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, + .exp_completed = rcu_exp_batches_completed, + .async = call_rcu_hurry, + .gp_barrier = rcu_barrier, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for srcu scalability testing. + */ + +DEFINE_STATIC_SRCU(srcu_ctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_scale; + +static int srcu_scale_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_scale_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_scale_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + call_srcu(srcu_ctlp, head, func); +} + +static void srcu_rcu_barrier(void) +{ + srcu_barrier(srcu_ctlp); +} + +static void srcu_scale_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_scale_stats(void) +{ + srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG); +} + +static void srcu_scale_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_scale_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, + .gp_diff = rcu_seq_diff, + .exp_completed = srcu_scale_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, + .name = "srcu" +}; + +static struct srcu_struct srcud; + +static void srcu_sync_scale_init(void) +{ + srcu_ctlp = &srcud; + init_srcu_struct(srcu_ctlp); +} + +static void srcu_sync_scale_cleanup(void) +{ + cleanup_srcu_struct(srcu_ctlp); +} + +static struct rcu_scale_ops srcud_ops = { + .ptype = SRCU_FLAVOR, + .init = srcu_sync_scale_init, + .cleanup = srcu_sync_scale_cleanup, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, + .gp_diff = rcu_seq_diff, + .exp_completed = srcu_scale_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, + .name = "srcud" +}; + +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks scalability testing. 
+ */ + +static int tasks_scale_read_lock(void) +{ + return 0; +} + +static void tasks_scale_read_unlock(int idx) +{ +} + +static void rcu_tasks_scale_stats(void) +{ + rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG); +} + +static struct rcu_scale_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_scale_read_lock, + .readunlock = tasks_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .async = call_rcu_tasks, + .gp_barrier = rcu_barrier_tasks, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .rso_gp_kthread = get_rcu_tasks_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats, + .name = "tasks" +}; + +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_RUDE_RCU + +/* + * Definitions for RCU-tasks-rude scalability testing. + */ + +static int tasks_rude_scale_read_lock(void) +{ + return 0; +} + +static void tasks_rude_scale_read_unlock(int idx) +{ +} + +static void rcu_tasks_rude_scale_stats(void) +{ + rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG); +} + +static struct rcu_scale_ops tasks_rude_ops = { + .ptype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_rude_scale_read_lock, + .readunlock = tasks_rude_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats, + .name = "tasks-rude" +}; + +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + +/* + * Definitions for RCU-tasks-trace scalability testing. + */ + +static int tasks_trace_scale_read_lock(void) +{ + rcu_read_lock_trace(); + return 0; +} + +static void tasks_trace_scale_read_unlock(int idx) +{ + rcu_read_unlock_trace(); +} + +static void rcu_tasks_trace_scale_stats(void) +{ + rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); +} + +static struct rcu_scale_ops tasks_tracing_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_trace_scale_read_lock, + .readunlock = tasks_trace_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .async = call_rcu_tasks_trace, + .gp_barrier = rcu_barrier_tasks_trace, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats, + .name = "tasks-tracing" +}; + +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + +static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + +/* + * If scalability tests complete, wait for shutdown to commence. + */ +static void rcu_scale_wait_shutdown(void) +{ + cond_resched_tasks_rcu_qs(); + if (atomic_read(&n_rcu_scale_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU scalability reader kthread. 
Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. However, the + * point of this test is not to evaluate reader scalability, but instead + * to serve as a test load for update-side scalability testing. + */ +static int +rcu_scale_reader(void *arg) +{ + unsigned long flags; + int idx; + long me = (long)arg; + + VERBOSE_SCALEOUT_STRING("rcu_scale_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_scale_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_scale_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_scale_reader"); + return 0; +} + +/* + * Allocate a writer_mblock structure for the specified rcu_scale_writer + * task. + */ +static struct writer_mblock *rcu_scale_alloc(long me) +{ + struct llist_node *llnp; + struct writer_freelist *wflp; + struct writer_mblock *wmbp; + + if (WARN_ON_ONCE(!writer_freelists)) + return NULL; + wflp = &writer_freelists[me]; + if (llist_empty(&wflp->ws_lhp)) { + // ->ws_lhp is private to its rcu_scale_writer task. + wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node); + wflp->ws_lhp.first = &wmbp->wmb_node; + } + llnp = llist_del_first(&wflp->ws_lhp); + if (!llnp) + return NULL; + return container_of(llnp, struct writer_mblock, wmb_node); +} + +/* + * Free a writer_mblock structure to its rcu_scale_writer task. + */ +static void rcu_scale_free(struct writer_mblock *wmbp) +{ + struct writer_freelist *wflp; + + if (!wmbp) + return; + wflp = wmbp->wmb_wfl; + llist_add(&wmbp->wmb_node, &wflp->ws_lhg); +} + +/* + * Callback function for asynchronous grace periods from rcu_scale_writer(). + */ +static void rcu_scale_async_cb(struct rcu_head *rhp) +{ + struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh); + struct writer_freelist *wflp = wmbp->wmb_wfl; + + atomic_dec(&wflp->ws_inflight); + rcu_scale_free(wmbp); +} + +/* + * RCU scale writer kthread. Repeatedly does a grace period. + */ +static int +rcu_scale_writer(void *arg) +{ + int i = 0; + int i_max; + unsigned long jdone; + long me = (long)arg; + bool selfreport = false; + bool started = false, done = false, alldone = false; + u64 t; + DEFINE_TORTURE_RANDOM(tr); + u64 *wdp; + u64 *wdpp = writer_durations[me]; + struct writer_freelist *wflp = &writer_freelists[me]; + struct writer_mblock *wmbp = NULL; + + VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); + WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + current->flags |= PF_NO_SETAFFINITY; + sched_set_fifo_low(current); + + if (holdoff) + schedule_timeout_idle(holdoff * HZ); + + /* + * Wait until rcu_end_inkernel_boot() is called for normal GP tests + * so that RCU is not always expedited for normal GP tests. + * The system_state test is approximate, but works well in practice. 
+ */ + while (!gp_exp && system_state != SYSTEM_RUNNING) + schedule_timeout_uninterruptible(1); + + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) { + t_rcu_scale_writer_started = t; + if (gp_exp) { + b_rcu_gp_test_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + } + + jdone = jiffies + minruntime * HZ; + do { + bool gp_succeeded = false; + + if (writer_holdoff) + udelay(writer_holdoff); + if (writer_holdoff_jiffies) + schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1); + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { + if (!wmbp) + wmbp = rcu_scale_alloc(me); + if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) { + atomic_inc(&wflp->ws_inflight); + cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb); + wmbp = NULL; + gp_succeeded = true; + } else if (!kthread_should_stop()) { + cur_ops->gp_barrier(); + } else { + rcu_scale_free(wmbp); /* Because we are stopping. */ + wmbp = NULL; + } + } else if (gp_exp) { + cur_ops->exp_sync(); + gp_succeeded = true; + } else { + cur_ops->sync(); + gp_succeeded = true; + } + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) { + done = true; + WRITE_ONCE(writer_done[me], true); + sched_set_normal(current, 0); + pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", + scale_type, SCALE_FLAG, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_scale_writer_finished) >= + nrealwriters) { + schedule_timeout_interruptible(10); + rcu_ftrace_dump(DUMP_ALL); + SCALEOUT_STRING("Test complete"); + t_rcu_scale_writer_finished = t; + if (gp_exp) { + b_rcu_gp_test_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_gp_test_finished = + cur_ops->get_gp_seq(); + } + if (shutdown) { + smp_mb(); /* Assign before wake. 
*/ + wake_up(&shutdown_wq); + } + } + } + if (done && !alldone && + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) + alldone = true; + if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) { + static atomic_t dumped; + int i; + + if (!atomic_xchg(&dumped, 1)) { + for (i = 0; i < nrealwriters; i++) { + if (writer_done[i]) + continue; + pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i); + sched_show_task(writer_tasks[i]); + } + if (cur_ops->stats) + cur_ops->stats(); + } + } + if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) { + pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n", + __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone); + selfreport = true; + } + if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_scale_wait_shutdown(); + } while (!torture_must_stop()); + if (gp_async && cur_ops->async) { + rcu_scale_free(wmbp); + cur_ops->gp_barrier(); + } + writer_n_durations[me] = i_max + 1; + torture_kthread_stopping("rcu_scale_writer"); + return 0; +} + +static void +rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) +{ + pr_alert("%s" SCALE_FLAG + "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n", + scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number + * of iterations and measure total time and number of GP for all iterations to complete. + */ + +torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); +torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); +torture_param(bool, kfree_rcu_test_double, false, "Do we run a kfree_rcu() double-argument scale test?"); +torture_param(bool, kfree_rcu_test_single, false, "Do we run a kfree_rcu() single-argument scale test?"); + +static struct task_struct **kfree_reader_tasks; +static int kfree_nrealthreads; +static atomic_t n_kfree_scale_thread_started; +static atomic_t n_kfree_scale_thread_ended; +static struct task_struct *kthread_tp; +static u64 kthread_stime; + +struct kfree_obj { + char kfree_obj[8]; + struct rcu_head rh; +}; + +/* Used if doing RCU-kfree'ing via call_rcu(). 
*/ +static void kfree_call_rcu(struct rcu_head *rh) +{ + struct kfree_obj *obj = container_of(rh, struct kfree_obj, rh); + + kfree(obj); +} + +static int +kfree_scale_thread(void *arg) +{ + int i, loop = 0; + long me = (long)arg; + struct kfree_obj *alloc_ptr; + u64 start_time, end_time; + long long mem_begin, mem_during = 0; + bool kfree_rcu_test_both; + DEFINE_TORTURE_RANDOM(tr); + + VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + kfree_rcu_test_both = (kfree_rcu_test_single == kfree_rcu_test_double); + + start_time = ktime_get_mono_fast_ns(); + + if (atomic_inc_return(&n_kfree_scale_thread_started) >= kfree_nrealthreads) { + if (gp_exp) + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + + do { + if (!mem_during) { + mem_during = mem_begin = si_mem_available(); + } else if (loop % (kfree_loops / 4) == 0) { + mem_during = (mem_during + si_mem_available()) / 2; + } + + for (i = 0; i < kfree_alloc_num; i++) { + alloc_ptr = kcalloc(kfree_mult, sizeof(struct kfree_obj), GFP_KERNEL); + if (!alloc_ptr) + return -ENOMEM; + + if (kfree_by_call_rcu) { + call_rcu(&(alloc_ptr->rh), kfree_call_rcu); + continue; + } + + // By default kfree_rcu_test_single and kfree_rcu_test_double are + // initialized to false. If both have the same value (false or true) + // both are randomly tested, otherwise only the one with value true + // is tested. + if ((kfree_rcu_test_single && !kfree_rcu_test_double) || + (kfree_rcu_test_both && torture_random(&tr) & 0x800)) + kfree_rcu_mightsleep(alloc_ptr); + else + kfree_rcu(alloc_ptr, rh); + } + + cond_resched(); + } while (!torture_must_stop() && ++loop < kfree_loops); + + if (atomic_inc_return(&n_kfree_scale_thread_ended) >= kfree_nrealthreads) { + end_time = ktime_get_mono_fast_ns(); + + if (gp_exp) + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); + + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", + (unsigned long long)(end_time - start_time), kfree_loops, + rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + PAGES_TO_MB(mem_begin - mem_during)); + + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + + torture_kthread_stopping("kfree_scale_thread"); + return 0; +} + +static void +kfree_scale_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (kfree_reader_tasks) { + for (i = 0; i < kfree_nrealthreads; i++) + torture_stop_kthread(kfree_scale_thread, + kfree_reader_tasks[i]); + kfree(kfree_reader_tasks); + kfree_reader_tasks = NULL; + } + + torture_cleanup_end(); +} + +/* + * shutdown kthread. Just waits to be awakened, then shuts down system. + */ +static int +kfree_scale_shutdown(void *arg) +{ + wait_event_idle(shutdown_wq, + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); + + smp_mb(); /* Wake before output. */ + + kfree_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +// Used if doing RCU-kfree'ing via call_rcu(). 
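The single-argument/double-argument selection inside kfree_scale_thread() above is easy to misread, so here is a stand-alone user-space sketch of just that decision: when the kfree_rcu_test_single and kfree_rcu_test_double module parameters are equal (both false by default, or both true), each allocation randomly exercises one of the two APIs; otherwise only the variant whose flag is set is used. In this sketch rand() stands in for torture_random() and strings stand in for the real kfree_rcu() calls; it illustrates the flag logic only and is not kernel code.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Decide which kfree_rcu() variant this iteration should exercise. */
    static const char *pick_variant(bool test_single, bool test_double)
    {
            /* Equal flags (both false or both true) mean "randomly test both". */
            bool test_both = (test_single == test_double);

            if ((test_single && !test_double) ||
                (test_both && (rand() & 0x800)))
                    return "kfree_rcu_mightsleep(ptr)";  /* single-argument form */
            return "kfree_rcu(ptr, rh)";                 /* double-argument form */
    }

    int main(void)
    {
            printf("single only: %s\n", pick_variant(true, false));
            printf("double only: %s\n", pick_variant(false, true));
            printf("defaults:    %s\n", pick_variant(false, false)); /* random */
            return 0;
    }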
+static unsigned long jiffies_at_lazy_cb; +static struct rcu_head lazy_test1_rh; +static int rcu_lazy_test1_cb_called; +static void call_rcu_lazy_test1(struct rcu_head *rh) +{ + jiffies_at_lazy_cb = jiffies; + WRITE_ONCE(rcu_lazy_test1_cb_called, 1); +} + +static int __init +kfree_scale_init(void) +{ + int firsterr = 0; + long i; + unsigned long jif_start; + unsigned long orig_jif; + + pr_alert("%s" SCALE_FLAG + "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n", + scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single); + + // Also, do a quick self-test to ensure laziness is as much as + // expected. + if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) { + pr_alert("CONFIG_RCU_LAZY is disabled, falling back to kfree_rcu() for delayed RCU kfree'ing\n"); + kfree_by_call_rcu = 0; + } + + if (kfree_by_call_rcu) { + /* do a test to check the timeout. */ + orig_jif = rcu_get_jiffies_lazy_flush(); + + rcu_set_jiffies_lazy_flush(2 * HZ); + rcu_barrier(); + + jif_start = jiffies; + jiffies_at_lazy_cb = 0; + call_rcu(&lazy_test1_rh, call_rcu_lazy_test1); + + smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1); + + rcu_set_jiffies_lazy_flush(orig_jif); + + if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { + pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); + firsterr = -1; + goto unwind; + } + + if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start > 3 * HZ)) { + pr_alert("ERROR: call_rcu() CBs are being too lazy!\n"); + firsterr = -1; + goto unwind; + } + } + + kfree_nrealthreads = compute_real(kfree_nthreads); + /* Start up the kthreads. */ + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(kfree_scale_shutdown, NULL, + shutdown_task); + if (torture_init_error(firsterr)) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n", + kfree_mult * sizeof(struct kfree_obj), + kfree_by_call_rcu); + + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), + GFP_KERNEL); + if (kfree_reader_tasks == NULL) { + firsterr = -ENOMEM; + goto unwind; + } + + for (i = 0; i < kfree_nrealthreads; i++) { + firsterr = torture_create_kthread(kfree_scale_thread, (void *)i, + kfree_reader_tasks[i]); + if (torture_init_error(firsterr)) + goto unwind; + } + + while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads) + schedule_timeout_uninterruptible(1); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + kfree_scale_cleanup(); + return firsterr; +} + +static void +rcu_scale_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + + // If built-in, just report all of the GP kthread's CPU time. 
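A side note on the CPU-time report that rcu_scale_cleanup() below prints for the grace-period kthread: the sampled ->stime delta is in nanoseconds, and div_u64_rem() splits it into microseconds plus a three-digit nanosecond remainder for the "%llu.%03u us" format. Here is a minimal user-space check of that formatting, with div_u64_rem() re-implemented via ordinary division (a stand-in, not the kernel helper) and a made-up sample value.

    #include <stdint.h>
    #include <stdio.h>

    /* User-space stand-in for the kernel's div_u64_rem() helper. */
    static uint64_t div_u64_rem(uint64_t dividend, uint32_t divisor, uint32_t *remainder)
    {
            *remainder = (uint32_t)(dividend % divisor);
            return dividend / divisor;
    }

    int main(void)
    {
            uint64_t kthread_stime = 1234567;  /* hypothetical CPU-time delta, in ns */
            uint32_t ns;
            uint64_t us = div_u64_rem(kthread_stime, 1000, &ns);

            /* Same "%llu.%03u us" format as rcu_scale_cleanup(): prints 1234.567 us. */
            printf("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n",
                   (unsigned long long)us, ns);
            return 0;
    }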
+ if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread) + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) { + u32 ns; + u64 us; + + kthread_stime = kthread_tp->stime - kthread_stime; + us = div_u64_rem(kthread_stime, 1000, &ns); + pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns); + show_rcu_gp_kthreads(); + } + if (kfree_rcu_test) { + kfree_scale_cleanup(); + return; + } + + if (torture_cleanup_begin()) + return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_scale_reader, + reader_tasks[i]); + kfree(reader_tasks); + reader_tasks = NULL; + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_scale_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + scale_type, SCALE_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, + ngps, + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j < writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + scale_type, SCALE_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + if (writer_freelists) { + int ctr = 0; + struct llist_node *llnp; + struct writer_freelist *wflp = &writer_freelists[i]; + + if (wflp->ws_mblocks) { + llist_for_each(llnp, wflp->ws_lhg.first) + ctr++; + llist_for_each(llnp, wflp->ws_lhp.first) + ctr++; + WARN_ONCE(ctr != gp_async_max, + "%s: ctr = %d gp_async_max = %d\n", + __func__, ctr, gp_async_max); + kfree(wflp->ws_mblocks); + } + } + } + kfree(writer_tasks); + writer_tasks = NULL; + kfree(writer_durations); + writer_durations = NULL; + kfree(writer_n_durations); + writer_n_durations = NULL; + kfree(writer_done); + writer_done = NULL; + kfree(writer_freelists); + writer_freelists = NULL; + } + + /* Do torture-type-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_scale_shutdown(void *arg) +{ + wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + smp_mb(); /* Wake before output. */ + rcu_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +rcu_scale_init(void) +{ + int firsterr = 0; + long i; + long j; + static struct rcu_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS + }; + + if (!torture_init_begin(scale_type, verbose)) + return -EBUSY; + + /* Process args and announce that the scalability'er is on the job. 
*/ + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); + pr_cont("\n"); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + if (cur_ops->rso_gp_kthread) { + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) + kthread_stime = kthread_tp->stime; + } + if (kfree_rcu_test) + return kfree_scale_init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_scale_reader_started, 0); + atomic_set(&n_rcu_scale_writer_started, 0); + atomic_set(&n_rcu_scale_writer_finished, 0); + rcu_scale_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. */ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_scale_shutdown, NULL, + shutdown_task); + if (torture_init_error(firsterr)) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_scale_reader, (void *)i, + reader_tasks[i]); + if (torture_init_error(firsterr)) + goto unwind; + } + while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); + writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); + writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL); + if (gp_async) { + if (gp_async_max <= 0) { + pr_warn("%s: gp_async_max = %d must be greater than zero.\n", + __func__, gp_async_max); + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + firsterr = -EINVAL; + goto unwind; + } + writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL); + } + if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done || + (gp_async && !writer_freelists)) { + SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) { + firsterr = -ENOMEM; + goto unwind; + } + if (writer_freelists) { + struct writer_freelist *wflp = &writer_freelists[i]; + + init_llist_head(&wflp->ws_lhg); + init_llist_head(&wflp->ws_lhp); + wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]), + GFP_KERNEL); + if (!wflp->ws_mblocks) { + firsterr = -ENOMEM; + goto unwind; + } + for (j = 0; j < gp_async_max; j++) { + struct writer_mblock *wmbp = &wflp->ws_mblocks[j]; + + wmbp->wmb_wfl = wflp; + llist_add(&wmbp->wmb_node, &wflp->ws_lhp); + } + } + firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, + writer_tasks[i]); + if (torture_init_error(firsterr)) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_scale_cleanup(); + if (shutdown) { + WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); + kernel_power_off(); + } + return firsterr; +} + 
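(Aside on the hunk above: rcu_scale_init() picks its flavor by scanning scale_ops[] for the entry whose ->name matches the scale_type module parameter, then routes every later grace-period operation through the single cur_ops pointer. The following is a minimal userspace sketch of that name-keyed ops-table idiom, illustrative only and not kernel code; the two stub flavors and their names are hypothetical.)

/*
 * Minimal userspace sketch of the ops-table dispatch used by
 * rcu_scale_init(). The "a" and "b" flavors are hypothetical stand-ins
 * for rcu_ops, srcu_ops, and friends.
 */
#include <stdio.h>
#include <string.h>

struct scale_ops {
	void (*sync)(void);	/* Stands in for ->sync, ->exp_sync, ... */
	const char *name;
};

static void sync_a(void) { puts("flavor A grace period"); }
static void sync_b(void) { puts("flavor B grace period"); }

static struct scale_ops ops_a = { .sync = sync_a, .name = "a" };
static struct scale_ops ops_b = { .sync = sync_b, .name = "b" };

int main(int argc, char **argv)
{
	static struct scale_ops *scale_ops[] = { &ops_a, &ops_b };
	const char *scale_type = argc > 1 ? argv[1] : "a";
	struct scale_ops *cur_ops = NULL;
	size_t i;

	/* Same lookup loop as rcu_scale_init(), minus torture plumbing. */
	for (i = 0; i < sizeof(scale_ops) / sizeof(scale_ops[0]); i++) {
		if (!strcmp(scale_type, scale_ops[i]->name)) {
			cur_ops = scale_ops[i];
			break;
		}
	}
	if (!cur_ops) {	/* Mirrors the "invalid scale type" unwind path. */
		fprintf(stderr, "invalid scale type: \"%s\"\n", scale_type);
		return 1;
	}
	cur_ops->sync();	/* All later calls go through cur_ops. */
	return 0;
}

(Keeping the lookup table static and hiding the flavor behind one pointer is what lets a single module drive RCU, SRCU, and the Tasks-RCU variants without per-call-site conditional compilation.)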
+module_init(rcu_scale_init); +module_exit(rcu_scale_cleanup); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f6e85faa4ff4..07e51974b06b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1,26 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update module-based torture test facility * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2005, 2006 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Josh Triplett <josh@joshtriplett.org> * - * See also: Documentation/RCU/torture.txt + * See also: Documentation/RCU/torture.rst */ #define pr_fmt(fmt) fmt @@ -33,7 +20,8 @@ #include <linux/err.h> #include <linux/spinlock.h> #include <linux/smp.h> -#include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> +#include <linux/rcu_notifier.h> #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <uapi/linux/sched/types.h> @@ -57,99 +45,150 @@ #include <linux/sched/debug.h> #include <linux/sched/sysctl.h> #include <linux/oom.h> +#include <linux/tick.h> +#include <linux/rcupdate_trace.h> +#include <linux/nmi.h> #include "rcu.h" +MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); - - -/* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) -#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ -#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ -#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ -#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ -#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */ -#define RCUTORTURE_MAX_EXTEND \ +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); + +// Bits for ->extendables field, extendables param, and related definitions. +#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh. +#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts. +#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption. +#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh(). +#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched(). 
+#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_UPDOWN 0x80 // ... up-read from task, down-read from timer.
+ // Note: Manual start, automatic end.
+#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above.
+#define RCUTORTURE_MAX_EXTEND \
 (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
- RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
+ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN.
+#define RCUTORTURE_RDR_ALLBITS \
+ (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \
+ RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2)
 #define RCUTORTURE_RDR_MAX_LOOPS 0x7 /* Maximum reader extensions. */
 /* Must be power of two minus one. */
 #define RCUTORTURE_RDR_MAX_SEGS (RCUTORTURE_RDR_MAX_LOOPS + 3)
 
 torture_param(int, extendables, RCUTORTURE_MAX_EXTEND,
 "Extend readers by disabling bh (1), irqs (2), or preempt (4)");
-torture_param(int, fqs_duration, 0,
- "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable");
 torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
 torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
-torture_param(bool, fwd_progress, 1, "Test grace-period forward progress");
+torture_param(int, fwd_progress, 1, "Number of grace-period forward progress tasks (0 to disable)");
 torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait");
-torture_param(int, fwd_progress_holdoff, 60,
- "Time between forward-progress tests (s)");
-torture_param(bool, fwd_progress_need_resched, 1,
- "Hide cond_resched() behind need_resched()");
+torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)");
+torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
 torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
+torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
+torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
+torture_param(bool, gp_cond_exp_full, false,
+ "Use conditional/async full-state expedited GP wait primitives");
+torture_param(int, gp_cond_wi, 16 * USEC_PER_SEC / HZ,
+ "Wait interval for normal conditional grace periods, us (default 16 jiffies)");
+torture_param(int, gp_cond_wi_exp, 128,
+ "Wait interval for expedited conditional grace periods, us (default 128 us)");
 torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
-torture_param(bool, gp_normal, false,
- "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
+torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
+torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
+torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
+torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
+torture_param(int, gp_poll_wi, 16 * USEC_PER_SEC / HZ,
+ "Wait interval for normal polled grace periods, us (default 16 jiffies)");
+torture_param(int, gp_poll_wi_exp, 128,
+ "Wait interval for expedited polled grace periods, us (default 128 us)");
 torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
 torture_param(int, 
irqreader, 1, "Allow RCU readers from irq handlers"); -torture_param(int, n_barrier_cbs, 0, - "# of callbacks/kthreads for barrier testing"); +torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); +torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); +torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); -torture_param(int, object_debug, 0, - "Enable debug-object double call_rcu() testing"); +torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(bool, gpwrap_lag, true, "Enable grace-period wrap lag testing"); +torture_param(int, gpwrap_lag_gps, 8, "Value to set for set_gpwrap_lag during an active testing period."); +torture_param(int, gpwrap_lag_cycle_mins, 30, "Total cycle duration for gpwrap lag testing (in minutes)"); +torture_param(int, gpwrap_lag_active_mins, 5, "Duration for which gpwrap lag is active within each cycle (in minutes)"); +torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); +torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); +torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable"); +torture_param(int, preempt_interval, MSEC_PER_SEC, "Interval between preemptions (ms)"); +torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); +torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); +torture_param(int, reader_flavor, SRCU_READ_FLAVOR_NORMAL, "Reader flavors to use, one per bit."); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); -torture_param(int, stall_cpu_holdoff, 10, - "Time to wait before starting stall (s)."); +torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one."); +torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s)."); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -torture_param(int, test_boost_duration, 4, - "Duration of each boost test, seconds."); -torture_param(int, test_boost_interval, 7, - "Interval between boost tests, seconds."); -torture_param(bool, test_no_idle_hz, true, - "Test support for tickless idle CPUs"); -torture_param(int, verbose, 1, - "Enable verbose debugging printk()s"); +torture_param(int, 
test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds."); +torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); +torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); +torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario."); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); +static int nrealnocbers; static int nrealreaders; +static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; +static struct task_struct *updown_task; +static struct task_struct **nocb_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; -static struct task_struct *fwd_prog_task; +static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; +static struct task_struct *read_exit_task; +static struct task_struct *preempt_task; #define RCU_TORTURE_PIPE_LEN 10 +// Mailbox-like structure to check RCU global memory ordering. +struct rcu_torture_reader_check { + unsigned long rtc_myloops; + int rtc_chkrdr; + unsigned long rtc_chkloops; + int rtc_ready; + struct rcu_torture_reader_check *rtc_assigner; +} ____cacheline_internodealigned_in_smp; + +// Update-side data structure used to check RCU readers. struct rcu_torture { struct rcu_head rtort_rcu; int rtort_pipe_count; struct list_head rtort_free; int rtort_mbtest; + struct rcu_torture_reader_check *rtort_chkp; }; static LIST_HEAD(rcu_torture_freelist); @@ -160,20 +199,27 @@ static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +static struct rcu_torture_reader_check *rcu_torture_reader_mbchk; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_mbchk_fail; +static atomic_t n_rcu_torture_mbchk_tries; static atomic_t n_rcu_torture_error; static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static atomic_long_t n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; /* did rcu_barrier test succeed? 
*/ +static unsigned long n_read_exits; static struct list_head rcu_torture_removed; +static unsigned long shutdown_jiffies; +static unsigned long start_gp_seq; +static atomic_long_t n_nocb_offload; +static atomic_long_t n_nocb_deoffload; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -182,10 +228,24 @@ static int rcu_torture_writer_state; #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 -#define RTWS_COND_SYNC 6 -#define RTWS_SYNC 7 -#define RTWS_STUTTER 8 -#define RTWS_STOPPING 9 +#define RTWS_COND_GET_FULL 6 +#define RTWS_COND_GET_EXP 7 +#define RTWS_COND_GET_EXP_FULL 8 +#define RTWS_COND_SYNC 9 +#define RTWS_COND_SYNC_FULL 10 +#define RTWS_COND_SYNC_EXP 11 +#define RTWS_COND_SYNC_EXP_FULL 12 +#define RTWS_POLL_GET 13 +#define RTWS_POLL_GET_FULL 14 +#define RTWS_POLL_GET_EXP 15 +#define RTWS_POLL_GET_EXP_FULL 16 +#define RTWS_POLL_WAIT 17 +#define RTWS_POLL_WAIT_FULL 18 +#define RTWS_POLL_WAIT_EXP 19 +#define RTWS_POLL_WAIT_EXP_FULL 20 +#define RTWS_SYNC 21 +#define RTWS_STUTTER 22 +#define RTWS_STOPPING 23 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -193,7 +253,21 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_DEF_FREE", "RTWS_EXP_SYNC", "RTWS_COND_GET", + "RTWS_COND_GET_FULL", + "RTWS_COND_GET_EXP", + "RTWS_COND_GET_EXP_FULL", "RTWS_COND_SYNC", + "RTWS_COND_SYNC_FULL", + "RTWS_COND_SYNC_EXP", + "RTWS_COND_SYNC_EXP_FULL", + "RTWS_POLL_GET", + "RTWS_POLL_GET_FULL", + "RTWS_POLL_GET_EXP", + "RTWS_POLL_GET_EXP_FULL", + "RTWS_POLL_WAIT", + "RTWS_POLL_WAIT_FULL", + "RTWS_POLL_WAIT_EXP", + "RTWS_POLL_WAIT_EXP_FULL", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -206,10 +280,16 @@ struct rt_read_seg { unsigned long rt_delay_ms; unsigned long rt_delay_us; bool rt_preempted; + int rt_cpu; + int rt_end_cpu; + unsigned long long rt_gp_seq; + unsigned long long rt_gp_seq_end; + u64 rt_ts; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; static int rt_read_nsegs; +static int rt_read_preempted; static const char *rcu_torture_writer_state_getname(void) { @@ -220,12 +300,6 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) -#define rcu_can_boost() 1 -#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -#define rcu_can_boost() 0 -#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ - #ifdef CONFIG_RCU_TRACE static u64 notrace rcu_trace_clock_local(void) { @@ -241,6 +315,15 @@ static u64 notrace rcu_trace_clock_local(void) } #endif /* #else #ifdef CONFIG_RCU_TRACE */ +/* + * Stop aggressive CPU-hog tests a bit before the end of the test in order + * to avoid interfering with test shutdown. + */ +static bool shutdown_time_arrived(void) +{ + return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ); +} + static unsigned long boost_starttime; /* jiffies of next boost test start. */ static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -250,7 +333,7 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */ +static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. 
*/ /* * Allocate an element from the rcu_tortures pool. @@ -297,22 +380,61 @@ struct rcu_torture_ops { void (*read_delay)(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp); void (*readunlock)(int idx); + int (*readlock_held)(void); // lockdep. + int (*readlock_nesting)(void); // actual nesting, if available, -1 if not. + int (*down_read)(void); + void (*up_read)(int idx); unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); - unsigned long (*get_state)(void); + void (*exp_current)(void); + unsigned long (*get_gp_state_exp)(void); + unsigned long (*start_gp_poll_exp)(void); + void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp); + bool (*poll_gp_state_exp)(unsigned long oldstate); + void (*cond_sync_exp)(unsigned long oldstate); + void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp); + unsigned long (*get_comp_state)(void); + void (*get_comp_state_full)(struct rcu_gp_oldstate *rgosp); + bool (*same_gp_state)(unsigned long oldstate1, unsigned long oldstate2); + bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2); + unsigned long (*get_gp_state)(void); + void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); + unsigned long (*start_gp_poll)(void); + void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); + bool (*poll_gp_state)(unsigned long oldstate); + bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp); + bool (*poll_need_2gp)(bool poll, bool poll_full); void (*cond_sync)(unsigned long oldstate); + void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); + int poll_active; + int poll_active_full; call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); + void (*gp_kthread_dbg)(void); + bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); + void (*get_gp_data)(int *flags, unsigned long *gp_seq); + void (*gp_slow_register)(atomic_t *rgssp); + void (*gp_slow_unregister)(atomic_t *rgssp); + bool (*reader_blocked)(void); + unsigned long long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); + void (*set_gpwrap_lag)(unsigned long lag); + int (*get_gpwrap_count)(int cpu); + long cbflood_max; int irq_capable; int can_boost; int extendables; - int ext_irq_conflict; + int slow_gps; + int no_pi_lock; + int debug_objects; + int start_poll_irqsoff; + int have_up_down; const char *name; }; @@ -322,7 +444,12 @@ static struct rcu_torture_ops *cur_ops; * Definitions for rcu torture testing. */ -static int rcu_torture_read_lock(void) __acquires(RCU) +static int torture_readlock_not_held(void) +{ + return rcu_read_lock_bh_held() || rcu_read_lock_sched_held(); +} + +static int rcu_torture_read_lock(void) { rcu_read_lock(); return 0; @@ -341,11 +468,11 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!rcu_fwd_cb_nodelay && + if (!atomic_read(&rcu_fwd_cb_nodelay) && !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); - if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) + if ((preempt_count() & HARDIRQ_MASK) || softirq_count()) longdelay_ms = 5; /* Avoid triggering BH limits. 
*/ mdelay(longdelay_ms); rtrsp->rt_delay_ms = longdelay_ms; @@ -358,17 +485,24 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) rtrsp->rt_delay_us = shortdelay_us; } if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 500))) { + !(torture_random(rrsp) % (nrealreaders * 500))) torture_preempt_schedule(); /* QS only if preemptible. */ - rtrsp->rt_preempted = true; - } } -static void rcu_torture_read_unlock(int idx) __releases(RCU) +static void rcu_torture_read_unlock(int idx) { rcu_read_unlock(); } +static int rcu_torture_readlock_nesting(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + return rcu_preempt_depth(); + if (IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return (preempt_count() & PREEMPT_MASK); + return -1; +} + /* * Update callback in the pipe. This should be invoked after a grace period. */ @@ -376,12 +510,19 @@ static bool rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; + struct rcu_torture_reader_check *rtrcp = READ_ONCE(rp->rtort_chkp); + if (rtrcp) { + WRITE_ONCE(rp->rtort_chkp, NULL); + smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire(). + } i = rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + WRITE_ONCE(rp->rtort_pipe_count, i + 1); + ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); + if (i + 1 >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; } @@ -431,7 +572,7 @@ static unsigned long rcu_no_completed(void) static void rcu_torture_deferred_free(struct rcu_torture *p) { - call_rcu(&p->rtort_rcu, rcu_torture_cb); + call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb); } static void rcu_sync_torture_init(void) @@ -439,28 +580,67 @@ static void rcu_sync_torture_init(void) INIT_LIST_HEAD(&rcu_torture_removed); } +static bool rcu_poll_need_2gp(bool poll, bool poll_full) +{ + return poll; +} + static struct rcu_torture_ops rcu_ops = { - .ttype = RCU_FLAVOR, - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .get_gp_seq = rcu_get_gp_seq, - .gp_diff = rcu_seq_diff, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .get_state = get_state_synchronize_rcu, - .cond_sync = cond_synchronize_rcu, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .stall_dur = rcu_jiffies_till_stall_check, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .extendables = RCUTORTURE_MAX_EXTEND, - .name = "rcu" + .ttype = RCU_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .readlock_held = torture_readlock_not_held, + .readlock_nesting = rcu_torture_readlock_nesting, + .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .same_gp_state = same_state_synchronize_rcu, + .same_gp_state_full = same_state_synchronize_rcu_full, + .get_comp_state = get_completed_synchronize_rcu, + .get_comp_state_full = get_completed_synchronize_rcu_full, + .get_gp_state = get_state_synchronize_rcu, + .get_gp_state_full = get_state_synchronize_rcu_full, + .start_gp_poll = start_poll_synchronize_rcu, + .start_gp_poll_full = start_poll_synchronize_rcu_full, + .poll_gp_state = poll_state_synchronize_rcu, + 
.poll_gp_state_full = poll_state_synchronize_rcu_full, + .poll_need_2gp = rcu_poll_need_2gp, + .cond_sync = cond_synchronize_rcu, + .cond_sync_full = cond_synchronize_rcu_full, + .poll_active = NUM_ACTIVE_RCU_POLL_OLDSTATE, + .poll_active_full = NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE, + .get_gp_state_exp = get_state_synchronize_rcu, + .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, + .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, + .poll_gp_state_exp = poll_state_synchronize_rcu, + .cond_sync_exp = cond_synchronize_rcu_expedited, + .cond_sync_exp_full = cond_synchronize_rcu_expedited_full, + .call = call_rcu_hurry, + .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, + .gp_kthread_dbg = show_rcu_gp_kthreads, + .check_boost_failed = rcu_check_boost_fail, + .stall_dur = rcu_jiffies_till_stall_check, + .get_gp_data = rcutorture_get_gp_data, + .gp_slow_register = rcu_gp_slow_register, + .gp_slow_unregister = rcu_gp_slow_unregister, + .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) + ? has_rcu_reader_blocked + : NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, + .set_gpwrap_lag = rcu_set_gpwrap_lag, + .get_gpwrap_count = rcu_get_gpwrap_count, + .irq_capable = 1, + .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), + .extendables = RCUTORTURE_MAX_EXTEND, + .debug_objects = 1, + .start_poll_irqsoff = 1, + .name = "rcu" }; /* @@ -494,15 +674,16 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, + .readlock_held = torture_readlock_not_held, .get_gp_seq = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, - .cb_barrier = NULL, - .fqs = NULL, - .stats = NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" }; @@ -511,12 +692,65 @@ static struct rcu_torture_ops rcu_busted_ops = { */ DEFINE_STATIC_SRCU(srcu_ctl); +DEFINE_STATIC_SRCU_FAST(srcu_ctlf); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_ctlfud); static struct srcu_struct srcu_ctld; static struct srcu_struct *srcu_ctlp = &srcu_ctl; +static struct rcu_torture_ops srcud_ops; + +static void srcu_torture_init(void) +{ + rcu_sync_torture_init(); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) + VERBOSE_TOROUT_STRING("srcu_torture_init normal SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_NMI) + VERBOSE_TOROUT_STRING("srcu_torture_init NMI-safe SRCU"); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + srcu_ctlp = &srcu_ctlf; + VERBOSE_TOROUT_STRING("srcu_torture_init fast SRCU"); + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + srcu_ctlp = &srcu_ctlfud; + VERBOSE_TOROUT_STRING("srcu_torture_init fast-up/down SRCU"); + } +} -static int srcu_torture_read_lock(void) __acquires(srcu_ctlp) +static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) { - return srcu_read_lock(srcu_ctlp); + srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq); +} + +static int srcu_torture_read_lock(void) +{ + int idx; + struct srcu_ctr __percpu *scp; + int ret = 0; + + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { + idx = srcu_read_lock(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx; + } + 
if (reader_flavor & SRCU_READ_FLAVOR_NMI) { + idx = srcu_read_lock_nmisafe(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 1; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 2; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 3; + } + return ret; } static void @@ -538,9 +772,67 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) } } -static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) +static void srcu_torture_read_unlock(int idx) +{ + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + srcu_read_unlock_fast_updown(srcu_ctlp, + __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x4) >> 2)); + if (reader_flavor & SRCU_READ_FLAVOR_NMI) + srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) + srcu_read_unlock(srcu_ctlp, idx & 0x1); +} + +static int torture_srcu_read_lock_held(void) +{ + return srcu_read_lock_held(srcu_ctlp); +} + +static bool srcu_torture_have_up_down(void) +{ + int rf = reader_flavor; + + if (!rf) + rf = SRCU_READ_FLAVOR_NORMAL; + return !!(cur_ops->have_up_down & rf); +} + +static int srcu_torture_down_read(void) +{ + int idx; + struct srcu_ctr __percpu *scp; + + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + WARN_ON_ONCE(reader_flavor & (reader_flavor - 1)); + + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { + idx = srcu_down_read(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + return idx; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + scp = srcu_down_read_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + return idx << 3; + } + WARN_ON_ONCE(1); + return 0; +} + +static void srcu_torture_up_read(int idx) { - srcu_read_unlock(srcu_ctlp, idx); + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) + srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || + !(reader_flavor & SRCU_READ_FLAVOR_ALL)) + srcu_up_read(srcu_ctlp, idx & 0x1); + else + WARN_ON_ONCE(1); } static unsigned long srcu_torture_completed(void) @@ -558,6 +850,21 @@ static void srcu_torture_synchronize(void) synchronize_srcu(srcu_ctlp); } +static unsigned long srcu_torture_get_gp_state(void) +{ + return get_state_synchronize_srcu(srcu_ctlp); +} + +static unsigned long srcu_torture_start_gp_poll(void) +{ + return start_poll_synchronize_srcu(srcu_ctlp); +} + +static bool srcu_torture_poll_gp_state(unsigned long oldstate) +{ + return poll_state_synchronize_srcu(srcu_ctlp, oldstate); +} + static void srcu_torture_call(struct rcu_head *head, rcu_callback_t func) { @@ -579,57 +886,105 @@ static void srcu_torture_synchronize_expedited(void) synchronize_srcu_expedited(srcu_ctlp); } +static void srcu_torture_expedite_current(void) +{ + srcu_expedite_current(srcu_ctlp); +} + static struct rcu_torture_ops srcu_ops = { .ttype = SRCU_FLAVOR, 
- .init = rcu_sync_torture_init, + .init = srcu_torture_init, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, + .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, + .get_gp_state = srcu_torture_get_gp_state, + .start_gp_poll = srcu_torture_start_gp_poll, + .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .get_gp_data = srcu_get_gp_data, + .cbflood_max = 50000, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcu" }; -static void srcu_torture_init(void) +static void srcud_torture_init(void) { rcu_sync_torture_init(); - WARN_ON(init_srcu_struct(&srcu_ctld)); + if (!reader_flavor || (reader_flavor & SRCU_READ_FLAVOR_NORMAL)) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init normal SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_NMI) { + WARN_ON(init_srcu_struct(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init NMI-safe SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + WARN_ON(init_srcu_struct_fast(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast SRCU"); + } else if (reader_flavor & SRCU_READ_FLAVOR_FAST_UPDOWN) { + WARN_ON(init_srcu_struct_fast_updown(&srcu_ctld)); + VERBOSE_TOROUT_STRING("srcud_torture_init fast-up/down SRCU"); + } else { + WARN_ON(init_srcu_struct(&srcu_ctld)); + } srcu_ctlp = &srcu_ctld; } static void srcu_torture_cleanup(void) { - static DEFINE_TORTURE_RANDOM(rand); - - if (torture_random(&rand) & 0x800) - cleanup_srcu_struct(&srcu_ctld); - else - cleanup_srcu_struct_quiesced(&srcu_ctld); + cleanup_srcu_struct(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } /* As above, but dynamically allocated. 
*/ static struct rcu_torture_ops srcud_ops = { .ttype = SRCU_FLAVOR, - .init = srcu_torture_init, + .init = srcud_torture_init, .cleanup = srcu_torture_cleanup, .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .readlock_held = torture_srcu_read_lock_held, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .exp_current = srcu_torture_expedite_current, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, + .get_gp_state = srcu_torture_get_gp_state, + .start_gp_poll = srcu_torture_start_gp_poll, + .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .get_gp_data = srcu_get_gp_data, + .cbflood_max = 50000, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), + .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST_UPDOWN, .name = "srcud" }; @@ -641,6 +996,7 @@ static struct rcu_torture_ops busted_srcud_ops = { .readlock = srcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = srcu_torture_read_unlock, + .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -649,11 +1005,65 @@ static struct rcu_torture_ops busted_srcud_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted_srcud" }; /* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not work well with CPU hotplug nor + * with rcutorture's shuffling. + */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + torture_sched_setaffinity(current->pid, cpumask_of(cpu), true); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static void rcu_sync_torture_init_trivial(void) +{ + rcu_sync_torture_init(); + // if (onoff_interval || shuffle_interval) { + if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) { + onoff_interval = 0; + shuffle_interval = 0; + } +} + +static int rcu_torture_read_lock_trivial(void) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init_trivial, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .irq_capable = 1, + .name = "trivial" +}; + +#ifdef CONFIG_TASKS_RCU + +/* * Definitions for RCU-tasks torture testing. 
*/ @@ -671,6 +1081,11 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); } +static void synchronize_rcu_mult_test(void) +{ + synchronize_rcu_mult(call_rcu_tasks, call_rcu_hurry); +} + static struct rcu_torture_ops tasks_ops = { .ttype = RCU_TASKS_FLAVOR, .init = rcu_sync_torture_init, @@ -680,48 +1095,122 @@ static struct rcu_torture_ops tasks_ops = { .get_gp_seq = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_mult_test, .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, - .fqs = NULL, - .stats = NULL, + .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread, + .get_gp_data = rcu_tasks_get_gp_data, .irq_capable = 1, + .slow_gps = 1, .name = "tasks" }; -static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) +#define TASKS_OPS &tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define TASKS_OPS + +#endif // #else #ifdef CONFIG_TASKS_RCU + + +#ifdef CONFIG_TASKS_RUDE_RCU + +/* + * Definitions for rude RCU-tasks torture testing. + */ + +static struct rcu_torture_ops tasks_rude_ops = { + .ttype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, + .get_gp_data = rcu_tasks_rude_get_gp_data, + .cbflood_max = 50000, + .irq_capable = 1, + .name = "tasks-rude" +}; + +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else #ifdef CONFIG_TASKS_RUDE_RCU + + +#ifdef CONFIG_TASKS_TRACE_RCU + +/* + * Definitions for tracing RCU-tasks torture testing. + */ + +static int tasks_tracing_torture_read_lock(void) { - if (!cur_ops->gp_diff) - return new - old; - return cur_ops->gp_diff(new, old); + rcu_read_lock_trace(); + return 0; } -static bool __maybe_unused torturing_tasks(void) +static void tasks_tracing_torture_read_unlock(int idx) { - return cur_ops == &tasks_ops; + rcu_read_unlock_trace(); } -/* - * RCU torture priority-boost testing. Runs one real-time thread per - * CPU for moderate bursts, repeatedly registering RCU callbacks and - * spinning waiting for them to be invoked. If a given callback takes - * too long to be invoked, we assume that priority inversion has occurred. - */ +static void rcu_tasks_tracing_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks_trace(&p->rtort_rcu, rcu_torture_cb); +} -struct rcu_boost_inflight { - struct rcu_head rcu; - int inflight; +static struct rcu_torture_ops tasks_tracing_ops = { + .ttype = RCU_TASKS_TRACING_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_tracing_torture_read_lock, + .read_delay = srcu_read_delay, /* just reuse srcu's version. 
*/ + .readunlock = tasks_tracing_torture_read_unlock, + .readlock_held = rcu_read_lock_trace_held, + .get_gp_seq = rcu_no_completed, + .deferred_free = rcu_tasks_tracing_torture_deferred_free, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .call = call_rcu_tasks_trace, + .cb_barrier = rcu_barrier_tasks_trace, + .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, + .get_gp_data = rcu_tasks_trace_get_gp_data, + .cbflood_max = 50000, + .irq_capable = 1, + .slow_gps = 1, + .name = "tasks-tracing" }; -static void rcu_torture_boost_cb(struct rcu_head *head) -{ - struct rcu_boost_inflight *rbip = - container_of(head, struct rcu_boost_inflight, rcu); +#define TASKS_TRACING_OPS &tasks_tracing_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define TASKS_TRACING_OPS + +#endif // #else #ifdef CONFIG_TASKS_TRACE_RCU + - /* Ensure RCU-core accesses precede clearing ->inflight */ - smp_store_release(&rbip->inflight, 0); +static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); } +/* + * RCU torture priority-boost testing. Runs one real-time thread per + * CPU for moderate bursts, repeatedly starting grace periods and waiting + * for them to complete. If a given grace period takes too long, we assume + * that priority inversion has occurred. + */ + static int old_rt_runtime = -1; static void rcu_torture_disable_rt_throttle(void) @@ -748,89 +1237,124 @@ static void rcu_torture_enable_rt_throttle(void) old_rt_runtime = -1; } -static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) +static bool rcu_torture_boost_failed(unsigned long gp_state, unsigned long *start) { - if (end - start > test_boost_duration * HZ - HZ / 2) { + int cpu; + static int dbg_done; + unsigned long end = jiffies; + bool gp_done; + unsigned long j; + static unsigned long last_persist; + unsigned long lp; + unsigned long mininterval = test_boost_duration * HZ - HZ / 2; + + if (end - *start > mininterval) { + // Recheck after checking time to avoid false positives. + smp_mb(); // Time check before grace-period check. + if (cur_ops->poll_gp_state(gp_state)) + return false; // passed, though perhaps just barely + if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, &cpu)) { + // At most one persisted message per boost test. + j = jiffies; + lp = READ_ONCE(last_persist); + if (time_after(j, lp + mininterval) && + cmpxchg(&last_persist, lp, j) == lp) { + if (cpu < 0) + pr_info("Boost inversion persisted: QS from all CPUs\n"); + else + pr_info("Boost inversion persisted: No QS from CPU %d\n", cpu); + } + return false; // passed on a technicality + } VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; + if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) { + pr_info("Boost inversion thread ->rt_priority %u gp_state %lu jiffies %lu\n", + current->rt_priority, gp_state, end - *start); + cur_ops->gp_kthread_dbg(); + // Recheck after print to flag grace period ending during splat. + gp_done = cur_ops->poll_gp_state(gp_state); + pr_info("Boost inversion: GP %lu %s.\n", gp_state, + gp_done ? 
"ended already" : "still pending"); - return true; /* failed */ + } + + return true; // failed + } else if (cur_ops->check_boost_failed && !cur_ops->check_boost_failed(gp_state, NULL)) { + *start = jiffies; } - return false; /* passed */ + return false; // passed } static int rcu_torture_boost(void *arg) { - unsigned long call_rcu_time; unsigned long endtime; + unsigned long gp_state; + unsigned long gp_state_time; unsigned long oldstarttime; - struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; + unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ; - VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) { + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + } else { + VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period"); + while (time_before(jiffies, booststarttime)) { + schedule_timeout_idle(HZ); + if (kthread_should_stop()) + goto cleanup; + } + VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period"); + } /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } + sched_set_fifo_low(current); - init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { - /* Track if the test failed already in this test interval? */ - bool failed = false; + bool failed = false; // Test failed already in this test interval + bool gp_initiated = false; - /* Increment n_rcu_torture_boosts once per boost-test */ - while (!kthread_should_stop()) { - if (mutex_trylock(&boost_mutex)) { - n_rcu_torture_boosts++; - mutex_unlock(&boost_mutex); - break; - } - schedule_timeout_uninterruptible(1); - } if (kthread_should_stop()) goto checkwait; /* Wait for the next test interval. */ - oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { + oldstarttime = READ_ONCE(boost_starttime); + while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); - stutter_wait("rcu_torture_boost"); + if (stutter_wait("rcu_torture_boost")) + sched_set_fifo_low(current); if (torture_must_stop()) goto checkwait; } - /* Do one boost-test interval. */ + // Do one boost-test interval. endtime = oldstarttime + test_boost_duration * HZ; - call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { - /* If we don't have a callback in flight, post one. */ - if (!smp_load_acquire(&rbi.inflight)) { - /* RCU core before ->inflight = 1. */ - smp_store_release(&rbi.inflight, 1); - call_rcu(&rbi.rcu, rcu_torture_boost_cb); - /* Check if the boost test failed */ - failed = failed || - rcu_torture_boost_failed(call_rcu_time, - jiffies); - call_rcu_time = jiffies; + while (time_before(jiffies, endtime)) { + // Has current GP gone too long? + if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state)) + failed = rcu_torture_boost_failed(gp_state, &gp_state_time); + // If we don't have a grace period in flight, start one. + if (!gp_initiated || cur_ops->poll_gp_state(gp_state)) { + gp_state = cur_ops->start_gp_poll(); + gp_initiated = true; + gp_state_time = jiffies; + } + if (stutter_wait("rcu_torture_boost")) { + sched_set_fifo_low(current); + // If the grace period already ended, + // we don't know when that happened, so + // start over. 
+ if (cur_ops->poll_gp_state(gp_state)) + gp_initiated = false; } - stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; } - /* - * If boost never happened, then inflight will always be 1, in - * this case the boost check would never happen in the above - * loop so do another one here. - */ - if (!failed && smp_load_acquire(&rbi.inflight)) - rcu_torture_boost_failed(call_rcu_time, jiffies); + // In case the grace period extended beyond the end of the loop. + if (gp_initiated && !failed && !cur_ops->poll_gp_state(gp_state)) + rcu_torture_boost_failed(gp_state, &gp_state_time); /* * Set the start time of the next test interval. @@ -839,27 +1363,30 @@ static int rcu_torture_boost(void *arg) * interval. Besides, we are running at RT priority, * so delays should be relatively rare. */ - while (oldstarttime == boost_starttime && - !kthread_should_stop()) { + while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) { if (mutex_trylock(&boost_mutex)) { - boost_starttime = jiffies + - test_boost_interval * HZ; + if (oldstarttime == boost_starttime) { + WRITE_ONCE(boost_starttime, + jiffies + test_boost_interval * HZ); + n_rcu_torture_boosts++; + } mutex_unlock(&boost_mutex); break; } - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } /* Go do the stutter. */ -checkwait: stutter_wait("rcu_torture_boost"); +checkwait: if (stutter_wait("rcu_torture_boost")) + sched_set_fifo_low(current); } while (!torture_must_stop()); +cleanup: /* Clean up and exit. */ - while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) { + while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } - destroy_rcu_head_on_stack(&rbi.rcu); torture_kthread_stopping("rcu_torture_boost"); return 0; } @@ -874,13 +1401,14 @@ rcu_torture_fqs(void *arg) { unsigned long fqs_resume_time; int fqs_burst_remaining; + int oldnice = task_nice(current); VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(HZ / 20); } fqs_burst_remaining = fqs_duration; while (fqs_burst_remaining > 0 && @@ -889,48 +1417,76 @@ rcu_torture_fqs(void *arg) udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; } - stutter_wait("rcu_torture_fqs"); + if (stutter_wait("rcu_torture_fqs")) + sched_set_normal(current, oldnice); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_fqs"); return 0; } +// Used by writers to randomly choose from the available grace-period primitives. +static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { }; +static int nsynctypes; + /* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). + * Determine which grace-period primitives are available. 
*/ -static int -rcu_torture_writer(void *arg) +static void rcu_torture_write_types(void) { - bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); - int expediting = 0; - unsigned long gp_snap; - bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; - bool gp_sync1 = gp_sync; - int i; - struct rcu_torture *rp; - struct rcu_torture *old_rp; - static DEFINE_TORTURE_RANDOM(rand); - int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, - RTWS_COND_GET, RTWS_SYNC }; - int nsynctypes = 0; - - VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - if (!can_expedite) - pr_alert("%s" TORTURE_FLAG - " GP expediting controlled from boot/sysfs for %s.\n", - torture_type, cur_ops->name); + bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full; + bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp; + bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll; + bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync; /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) - gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; - if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { + if (!gp_cond1 && + !gp_cond_exp1 && + !gp_cond_full1 && + !gp_cond_exp_full1 && + !gp_exp1 && + !gp_poll_exp1 && + !gp_poll_exp_full1 && + !gp_normal1 && + !gp_poll1 && + !gp_poll_full1 && + !gp_sync1) { + gp_cond1 = true; + gp_cond_exp1 = true; + gp_cond_full1 = true; + gp_cond_exp_full1 = true; + gp_exp1 = true; + gp_poll_exp1 = true; + gp_poll_exp_full1 = true; + gp_normal1 = true; + gp_poll1 = true; + gp_poll_full1 = true; + gp_sync1 = true; + } + if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); - } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { + } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); } + if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP; + pr_info("%s: Testing conditional expedited GPs.\n", __func__); + } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) { + pr_alert("%s: gp_cond_exp without primitives.\n", __func__); + } + if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) { + synctype[nsynctypes++] = RTWS_COND_GET_FULL; + pr_info("%s: Testing conditional full-state GPs.\n", __func__); + } else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) { + pr_alert("%s: gp_cond_full without primitives.\n", __func__); + } + if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL; + pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__); + } else if (gp_cond_exp_full && + (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) { + pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__); + } if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; pr_info("%s: Testing expedited GPs.\n", __func__); @@ -943,14 +1499,119 @@ rcu_torture_writer(void *arg) } else if (gp_normal && !cur_ops->deferred_free) { pr_alert("%s: gp_normal without primitives.\n", __func__); } + if (gp_poll1 && cur_ops->get_comp_state && cur_ops->same_gp_state && + cur_ops->start_gp_poll && 
cur_ops->poll_gp_state) { + synctype[nsynctypes++] = RTWS_POLL_GET; + pr_info("%s: Testing polling GPs.\n", __func__); + } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { + pr_alert("%s: gp_poll without primitives.\n", __func__); + } + if (gp_poll_full1 && cur_ops->get_comp_state_full && cur_ops->same_gp_state_full + && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) { + synctype[nsynctypes++] = RTWS_POLL_GET_FULL; + pr_info("%s: Testing polling full-state GPs.\n", __func__); + } else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) { + pr_alert("%s: gp_poll_full without primitives.\n", __func__); + } + if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP; + pr_info("%s: Testing polling expedited GPs.\n", __func__); + } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) { + pr_alert("%s: gp_poll_exp without primitives.\n", __func__); + } + if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL; + pr_info("%s: Testing polling full-state expedited GPs.\n", __func__); + } else if (gp_poll_exp_full && + (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) { + pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); } + pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes); + pr_info("%s: gp_cond_wi %d gp_cond_wi_exp %d gp_poll_wi %d gp_poll_wi_exp %d\n", __func__, gp_cond_wi, gp_cond_wi_exp, gp_poll_wi, gp_poll_wi_exp); +} + +/* + * Do the specified rcu_torture_writer() synchronous grace period, + * while also testing out the polled APIs. Note well that the single-CPU + * grace-period optimizations must be accounted for. + */ +static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void)) +{ + unsigned long cookie; + struct rcu_gp_oldstate cookie_full; + bool dopoll; + bool dopoll_full; + unsigned long r = torture_random(trsp); + + dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300); + dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00); + if (dopoll || dopoll_full) + cpus_read_lock(); + if (dopoll) + cookie = cur_ops->get_gp_state(); + if (dopoll_full) + cur_ops->get_gp_state_full(&cookie_full); + if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full)) + sync(); + sync(); + WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie), + "%s: Cookie check 3 failed %pS() online %*pbl.", + __func__, sync, cpumask_pr_args(cpu_online_mask)); + WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 4 failed %pS() online %*pbl", + __func__, sync, cpumask_pr_args(cpu_online_mask)); + if (dopoll || dopoll_full) + cpus_read_unlock(); +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). 
+ */ +static int +rcu_torture_writer(void *arg) +{ + bool booting_still = false; + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); + unsigned long cookie; + struct rcu_gp_oldstate cookie_full; + int expediting = 0; + unsigned long gp_snap; + unsigned long gp_snap1; + struct rcu_gp_oldstate gp_snap_full; + struct rcu_gp_oldstate gp_snap1_full; + int i; + int idx; + unsigned long j; + int oldnice = task_nice(current); + struct rcu_gp_oldstate *rgo = NULL; + int rgo_size = 0; + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_TORTURE_RANDOM(rand); + unsigned long stallsdone = jiffies; + bool stutter_waited; + unsigned long *ulo = NULL; + int ulo_size = 0; + + // If a new stall test is added, this must be adjusted. + if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) + stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * + HZ * (stall_cpu_repeat + 1); + VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); + if (!can_expedite) + pr_alert("%s" TORTURE_FLAG + " GP expediting controlled from boot/sysfs for %s.\n", + torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, - "rcu_torture_writer: No update-side primitives.\n")) { + "%s: No update-side primitives.\n", __func__)) { /* * No updates primitives, so don't try updating. * The resulting test won't be testing much, hence the @@ -958,15 +1619,37 @@ rcu_torture_writer(void *arg) */ rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); + return 0; + } + if (cur_ops->poll_active > 0) { + ulo = kcalloc(cur_ops->poll_active, sizeof(*ulo), GFP_KERNEL); + if (!WARN_ON(!ulo)) + ulo_size = cur_ops->poll_active; + } + if (cur_ops->poll_active_full > 0) { + rgo = kcalloc(cur_ops->poll_active_full, sizeof(*rgo), GFP_KERNEL); + if (!WARN_ON(!rgo)) + rgo_size = cur_ops->poll_active_full; } + // If the system is still booting, let it finish. + j = jiffies; + while (!torture_must_stop() && !rcu_inkernel_boot_has_ended()) { + booting_still = true; + schedule_timeout_interruptible(HZ); + } + if (booting_still) + pr_alert("%s" TORTURE_FLAG " Waited %lu jiffies for boot to complete.\n", + torture_type, jiffies - j); + do { rcu_torture_writer_state = RTWS_FIXED_DELAY; - schedule_timeout_uninterruptible(1); + torture_hrtimeout_us(500, 1000, &rand); rp = rcu_torture_alloc(); if (rp == NULL) continue; rp->rtort_pipe_count = 0; + ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); rcu_torture_writer_state = RTWS_DELAY; udelay(torture_random(&rand) & 0x3ff); rcu_torture_writer_state = RTWS_REPLACE; @@ -980,7 +1663,41 @@ rcu_torture_writer(void *arg) if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; + WRITE_ONCE(old_rp->rtort_pipe_count, + old_rp->rtort_pipe_count + 1); + ASSERT_EXCLUSIVE_WRITER(old_rp->rtort_pipe_count); + + // Make sure readers block polled grace periods. 
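[Aside on the contract exercised by the cookie checks that follow: RCU's polled grace-period interface promises that a cookie captured inside a read-side critical section cannot report a completed grace period while that reader is still running. A minimal sketch of an out-of-tree consumer of this interface is shown below; struct my_obj, my_defer(), and my_try_free() are hypothetical names, while start_poll_synchronize_rcu() and poll_state_synchronize_rcu() are the stock vanilla-RCU primitives behind cur_ops->start_gp_poll() and cur_ops->poll_gp_state().

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct my_obj {
		int data;
		unsigned long gp_cookie;	/* Snapshot taken at removal time. */
	};

	/* Removal path: start (or note) a grace period instead of blocking. */
	static void my_defer(struct my_obj *p)
	{
		p->gp_cookie = start_poll_synchronize_rcu();
	}

	/* Reclaim path: free only once that grace period has elapsed. */
	static bool my_try_free(struct my_obj *p)
	{
		if (!poll_state_synchronize_rcu(p->gp_cookie))
			return false;	/* Pre-existing readers may still hold p. */
		kfree(p);
		return true;
	}

The checks below take a cookie from within a read-side critical section and complain if poll_gp_state() reports completion before the matching readunlock(), which would indicate a too-short grace period.]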
+ if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { + idx = cur_ops->readlock(); + cookie = cur_ops->get_gp_state(); + WARN_ONCE(cur_ops->poll_gp_state(cookie), + "%s: Cookie check 1 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); + if (cur_ops->get_comp_state) { + cookie = cur_ops->get_comp_state(); + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); + } + cur_ops->readunlock(idx); + } + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) { + idx = cur_ops->readlock(); + cur_ops->get_gp_state_full(&cookie_full); + WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full), + "%s: Cookie check 5 failed %s(%d) online %*pbl\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cpumask_pr_args(cpu_online_mask)); + if (cur_ops->get_comp_state_full) { + cur_ops->get_comp_state_full(&cookie_full); + WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); + } + cur_ops->readunlock(idx); + } switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -988,23 +1705,111 @@ rcu_torture_writer(void *arg) break; case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; - cur_ops->exp_sync(); + do_rtws_sync(&rand, cur_ops->exp_sync); rcu_torture_pipe_update(old_rp); break; case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; - gp_snap = cur_ops->get_state(); - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); + gp_snap = cur_ops->get_gp_state(); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC; cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_COND_GET_EXP: + rcu_torture_writer_state = RTWS_COND_GET_EXP; + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP; + cur_ops->cond_sync_exp(gp_snap); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_COND_GET_FULL: + rcu_torture_writer_state = RTWS_COND_GET_FULL; + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_FULL; + cur_ops->cond_sync_full(&gp_snap_full); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_COND_GET_EXP_FULL: + rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL; + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL; + cur_ops->cond_sync_exp_full(&gp_snap_full); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_POLL_GET: + rcu_torture_writer_state = RTWS_POLL_GET; + for (i = 0; i < ulo_size; i++) + ulo[i] = cur_ops->get_comp_state(); + gp_snap = cur_ops->start_gp_poll(); + rcu_torture_writer_state = RTWS_POLL_WAIT; + if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + cur_ops->exp_current(); + while (!cur_ops->poll_gp_state(gp_snap)) { + gp_snap1 = cur_ops->get_gp_state(); + for (i = 0; i < ulo_size; i++) + if (cur_ops->poll_gp_state(ulo[i]) || + cur_ops->same_gp_state(ulo[i], gp_snap1)) { + ulo[i] = gp_snap1; + break; + } + WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); + } + rcu_torture_pipe_update(old_rp); + break; + case 
RTWS_POLL_GET_FULL: + rcu_torture_writer_state = RTWS_POLL_GET_FULL; + for (i = 0; i < rgo_size; i++) + cur_ops->get_comp_state_full(&rgo[i]); + cur_ops->start_gp_poll_full(&gp_snap_full); + rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; + if (cur_ops->exp_current && !torture_random(&rand) % 0xff) + cur_ops->exp_current(); + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + cur_ops->get_gp_state_full(&gp_snap1_full); + for (i = 0; i < rgo_size; i++) + if (cur_ops->poll_gp_state_full(&rgo[i]) || + cur_ops->same_gp_state_full(&rgo[i], + &gp_snap1_full)) { + rgo[i] = gp_snap1_full; + break; + } + WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); + } + rcu_torture_pipe_update(old_rp); + break; + case RTWS_POLL_GET_EXP: + rcu_torture_writer_state = RTWS_POLL_GET_EXP; + gp_snap = cur_ops->start_gp_poll_exp(); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; + while (!cur_ops->poll_gp_state_exp(gp_snap)) + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); + rcu_torture_pipe_update(old_rp); + break; + case RTWS_POLL_GET_EXP_FULL: + rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL; + cur_ops->start_gp_poll_exp_full(&gp_snap_full); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; - cur_ops->sync(); + do_rtws_sync(&rand, cur_ops->sync); rcu_torture_pipe_update(old_rp); break; default: @@ -1029,11 +1834,26 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - if (stutter_wait("rcu_torture_writer")) + stutter_waited = stutter_wait("rcu_torture_writer"); + if (stutter_waited && + !atomic_read(&rcu_fwd_cb_nodelay) && + !cur_ops->slow_gps && + !torture_must_stop() && + time_after(jiffies, stallsdone)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) - if (list_empty(&rcu_tortures[i].rtort_free)) - WARN_ON_ONCE(1); + if (list_empty(&rcu_tortures[i].rtort_free) && + rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { + tracing_off(); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); + WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + rcu_ftrace_dump(DUMP_ALL); + break; + } + if (stutter_waited) + sched_set_normal(current, oldnice); } while (!torture_must_stop()); + rcu_torture_current = NULL; // Let stats task know that we are done. /* Reset expediting back to unexpedited. */ if (expediting > 0) expediting = -expediting; @@ -1044,6 +1864,8 @@ rcu_torture_writer(void *arg) pr_alert("%s" TORTURE_FLAG " Dynamic grace-period expediting was disabled.\n", torture_type); + kfree(ulo); + kfree(rgo); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; @@ -1056,26 +1878,99 @@ rcu_torture_writer(void *arg) static int rcu_torture_fakewriter(void *arg) { + unsigned long gp_snap; + struct rcu_gp_oldstate gp_snap_full; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, MAX_NICE); + if (WARN_ONCE(nsynctypes == 0, + "%s: No update-side primitives.\n", __func__)) { + /* + * No updates primitives, so don't try updating. + * The resulting test won't be testing much, hence the + * above WARN_ONCE(). 
+ */ + torture_kthread_stopping("rcu_torture_fakewriter"); + return 0; + } + do { - schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); - udelay(torture_random(&rand) & 0x3ff); + torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && - torture_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nrealfakewriters * 8) == 0) { cur_ops->cb_barrier(); - } else if (gp_normal == gp_exp) { - if (cur_ops->sync && torture_random(&rand) & 0x80) - cur_ops->sync(); - else if (cur_ops->exp_sync) + } else { + switch (synctype[torture_random(&rand) % nsynctypes]) { + case RTWS_DEF_FREE: + break; + case RTWS_EXP_SYNC: cur_ops->exp_sync(); - } else if (gp_normal && cur_ops->sync) { - cur_ops->sync(); - } else if (cur_ops->exp_sync) { - cur_ops->exp_sync(); + break; + case RTWS_COND_GET: + gp_snap = cur_ops->get_gp_state(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync(gp_snap); + break; + case RTWS_COND_GET_EXP: + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_exp(gp_snap); + break; + case RTWS_COND_GET_FULL: + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_full(&gp_snap_full); + break; + case RTWS_COND_GET_EXP_FULL: + cur_ops->get_gp_state_full(&gp_snap_full); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_exp_full(&gp_snap_full); + break; + case RTWS_POLL_GET: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); + gp_snap = cur_ops->start_gp_poll(); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); + while (!cur_ops->poll_gp_state(gp_snap)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; + case RTWS_POLL_GET_FULL: + if (cur_ops->start_poll_irqsoff) + local_irq_disable(); + cur_ops->start_gp_poll_full(&gp_snap_full); + if (cur_ops->start_poll_irqsoff) + local_irq_enable(); + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; + case RTWS_POLL_GET_EXP: + gp_snap = cur_ops->start_gp_poll_exp(); + while (!cur_ops->poll_gp_state_exp(gp_snap)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; + case RTWS_POLL_GET_EXP_FULL: + cur_ops->start_gp_poll_exp_full(&gp_snap_full); + while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; + case RTWS_SYNC: + cur_ops->sync(); + break; + default: + WARN_ON_ONCE(1); + break; + } } stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); @@ -1089,6 +1984,121 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +// Set up and carry out testing of RCU's global memory ordering +static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, + struct torture_random_state *trsp) +{ + unsigned long loops; + int noc = torture_num_online_cpus(); + int rdrchked; + int rdrchker; + struct rcu_torture_reader_check *rtrcp; // Me. + struct rcu_torture_reader_check *rtrcp_assigner; // Assigned us to do checking. + struct rcu_torture_reader_check *rtrcp_chked; // Reader being checked. + struct rcu_torture_reader_check *rtrcp_chker; // Reader doing checking when not me. + + if (myid < 0) + return; // Don't try this from timer handlers. + + // Increment my counter. 
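[The rtc_assigner/rtc_chkrdr handshake below leans on the kernel's release/acquire message-passing pattern: each smp_store_release() is paired with an smp_load_acquire(), so values written before the release are guaranteed visible after the acquire. A distilled sketch of that pattern, with hypothetical names (the barrier primitives are the standard ones from <asm/barrier.h>):

	#include <linux/atomic.h>
	#include <linux/errno.h>

	static int msg_data;
	static int msg_ready;

	/* Writer: make msg_data globally visible before publishing the flag. */
	static void my_publish(void)
	{
		msg_data = 42;
		smp_store_release(&msg_ready, 1);
	}

	/* Reader: the acquire load of the flag orders the later data read. */
	static int my_consume(void)
	{
		if (!smp_load_acquire(&msg_ready))
			return -EAGAIN;		/* Nothing published yet. */
		return msg_data;		/* Guaranteed to observe 42. */
	}

In the function below, rtc_assigner and rtc_chkrdr play the role of msg_ready and the per-reader loop counts play the role of msg_data; a check that sees a stale count increments n_rcu_torture_mbchk_fail.]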
+ rtrcp = &rcu_torture_reader_mbchk[myid]; + WRITE_ONCE(rtrcp->rtc_myloops, rtrcp->rtc_myloops + 1); + + // Attempt to assign someone else some checking work. + rdrchked = torture_random(trsp) % nrealreaders; + rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked]; + rdrchker = torture_random(trsp) % nrealreaders; + rtrcp_chker = &rcu_torture_reader_mbchk[rdrchker]; + if (rdrchked != myid && rdrchked != rdrchker && noc >= rdrchked && noc >= rdrchker && + smp_load_acquire(&rtrcp->rtc_chkrdr) < 0 && // Pairs with smp_store_release below. + !READ_ONCE(rtp->rtort_chkp) && + !smp_load_acquire(&rtrcp_chker->rtc_assigner)) { // Pairs with smp_store_release below. + rtrcp->rtc_chkloops = READ_ONCE(rtrcp_chked->rtc_myloops); + WARN_ON_ONCE(rtrcp->rtc_chkrdr >= 0); + rtrcp->rtc_chkrdr = rdrchked; + WARN_ON_ONCE(rtrcp->rtc_ready); // This gets set after the grace period ends. + if (cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, NULL, rtrcp) || + cmpxchg_relaxed(&rtp->rtort_chkp, NULL, rtrcp)) + (void)cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, rtrcp, NULL); // Back out. + } + + // If assigned some completed work, do it! + rtrcp_assigner = READ_ONCE(rtrcp->rtc_assigner); + if (!rtrcp_assigner || !smp_load_acquire(&rtrcp_assigner->rtc_ready)) + return; // No work or work not yet ready. + rdrchked = rtrcp_assigner->rtc_chkrdr; + if (WARN_ON_ONCE(rdrchked < 0)) + return; + rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked]; + loops = READ_ONCE(rtrcp_chked->rtc_myloops); + atomic_inc(&n_rcu_torture_mbchk_tries); + if (ULONG_CMP_LT(loops, rtrcp_assigner->rtc_chkloops)) + atomic_inc(&n_rcu_torture_mbchk_fail); + rtrcp_assigner->rtc_chkloops = loops + ULONG_MAX / 2; + rtrcp_assigner->rtc_ready = 0; + smp_store_release(&rtrcp->rtc_assigner, NULL); // Someone else can assign us work. + smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign. +} + +// Verify the specified RCUTORTURE_RDR* state. +#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() +static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) +{ + int mask; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi()) + return; + + WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS); + WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS); + + // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable. + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + !softirq_count(), ROEC_ARGS); + WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + !(preempt_count() & PREEMPT_MASK), ROEC_ARGS); + WARN_ONCE(cur_ops->readlock_nesting && + (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + cur_ops->readlock_nesting() == 0, ROEC_ARGS); + + // Interrupt handlers have all sorts of stuff disabled, so ignore + // unintended disabling. + if (in_serving_softirq() || in_hardirq()) + return; + + WARN_ONCE(cur_ops->extendables && + !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + softirq_count(), ROEC_ARGS); + + /* + * non-preemptible RCU in a preemptible kernel uses preempt_disable() + * as rcu_read_lock(). 
+ */ + mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + + WARN_ONCE(cur_ops->extendables && !(curstate & mask) && + (preempt_count() & PREEMPT_MASK), ROEC_ARGS); + + /* + * non-preemptible RCU in a preemptible kernel uses "preempt_count() & + * PREEMPT_MASK" as ->readlock_nesting(). + */ + mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count()) + mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; + + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && + cur_ops->readlock_nesting() > 0, ROEC_ARGS); +} + /* * Do one extension of an RCU read-side critical section using the * current reader state in readstate (set to zero for initial entry @@ -1098,59 +2108,117 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) * beginning or end of the critical section and if there was actually a * change, do a ->read_delay(). */ -static void rcutorture_one_extend(int *readstate, int newstate, - struct torture_random_state *trsp, +static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { - int idxnew = -1; - int idxold = *readstate; + bool first; + unsigned long flags; + int idxnew1 = -1; + int idxnew2 = -1; + int idxold1 = *readstate; + int idxold2 = idxold1; int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate; - WARN_ON_ONCE(idxold < 0); - WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + first = idxold1 == 0; + WARN_ON_ONCE(idxold2 < 0); + WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN)); + rcutorture_one_extend_check("before change", idxold1, statesnew, statesold); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ if (statesnew & RCUTORTURE_RDR_BH) local_bh_disable(); + if (statesnew & RCUTORTURE_RDR_RBH) + rcu_read_lock_bh(); if (statesnew & RCUTORTURE_RDR_IRQ) local_irq_disable(); if (statesnew & RCUTORTURE_RDR_PREEMPT) preempt_disable(); - if (statesnew & RCUTORTURE_RDR_RBH) - rcu_read_lock_bh(); if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); - if (statesnew & RCUTORTURE_RDR_RCU) - idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + if (statesnew & RCUTORTURE_RDR_RCU_1) + idxnew1 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; + if (statesnew & RCUTORTURE_RDR_RCU_2) + idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; + + // Complain unless both the old and the new protection is in place. + rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold); + + // Sample CPU under both sets of protections to reduce confusion. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + int cpu = raw_smp_processor_id(); + rtrsp->rt_cpu = cpu; + if (!first) { + rtrsp[-1].rt_end_cpu = cpu; + if (cur_ops->reader_blocked) + rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); + } + } + // Sample grace-period sequence number, as good a place as any. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { + rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + rtrsp->rt_ts = ktime_get_mono_fast_ns(); + if (!first) + rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; + } - /* Next, remove old protection, irq first due to bh conflict. 
 */
+ /*
+ * Next, remove old protection, in decreasing order of strength
+ * to avoid unlock paths that aren't safe in the stronger
+ * context. Namely: BH can not be enabled with disabled interrupts.
+ * Additionally PREEMPT_RT requires that BH is enabled in preemptible
+ * context.
+ */
 if (statesold & RCUTORTURE_RDR_IRQ)
 local_irq_enable();
- if (statesold & RCUTORTURE_RDR_BH)
- local_bh_enable();
 if (statesold & RCUTORTURE_RDR_PREEMPT)
 preempt_enable();
- if (statesold & RCUTORTURE_RDR_RBH)
- rcu_read_unlock_bh();
 if (statesold & RCUTORTURE_RDR_SCHED)
 rcu_read_unlock_sched();
- if (statesold & RCUTORTURE_RDR_RCU)
- cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT);
+ if (statesold & RCUTORTURE_RDR_BH)
+ local_bh_enable();
+ if (statesold & RCUTORTURE_RDR_RBH)
+ rcu_read_unlock_bh();
+ if (statesold & RCUTORTURE_RDR_RCU_2) {
+ cur_ops->readunlock((idxold2 & RCUTORTURE_RDR_MASK_2) >> RCUTORTURE_RDR_SHIFT_2);
+ WARN_ON_ONCE(idxnew2 != -1);
+ idxold2 = 0;
+ }
+ if (statesold & RCUTORTURE_RDR_RCU_1) {
+ bool lockit;
+
+ lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff);
+ if (lockit)
+ raw_spin_lock_irqsave(&current->pi_lock, flags);
+ cur_ops->readunlock((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
+ if (lockit)
+ raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ }
+ if (statesold & RCUTORTURE_RDR_UPDOWN) {
+ cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
+ }
 /* Delay if neither beginning nor end and there was a change. */
 if ((statesnew || statesold) && *readstate && newstate)
 cur_ops->read_delay(trsp, rtrsp);
 /* Update the reader state. */
- if (idxnew == -1)
- idxnew = idxold & ~RCUTORTURE_RDR_MASK;
- WARN_ON_ONCE(idxnew < 0);
- WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1);
- *readstate = idxnew | newstate;
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0);
- WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1);
+ if (idxnew1 == -1)
+ idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1;
+ WARN_ON_ONCE(idxnew1 < 0);
+ if (idxnew2 == -1)
+ idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2;
+ WARN_ON_ONCE(idxnew2 < 0);
+ *readstate = idxnew1 | idxnew2 | newstate;
+ WARN_ON_ONCE(*readstate < 0);
+ if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS))
+ pr_info("Unexpected readstate value of %#x\n", *readstate);
+ rcutorture_one_extend_check("after change", *readstate, statesnew, statesold);
 }
 /* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -1160,7 +2228,7 @@ static int rcutorture_extend_mask_max(void)
 WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND);
 mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables;
- mask = mask | RCUTORTURE_RDR_RCU;
+ mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
 return mask;
 }
@@ -1169,25 +2237,47 @@ static int
 rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
 {
 int mask = rcutorture_extend_mask_max();
- unsigned long randmask1 = torture_random(trsp) >> 8;
+ unsigned long randmask1 = torture_random(trsp);
 unsigned long randmask2 = randmask1 >> 3;
+ unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ;
+ unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
- /* Most of the time lots of bits, half the time only one bit. 
*/ + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); // Can't have reader idx bits. + /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); - /* Can't enable bh w/irq disabled. */ - if ((mask & RCUTORTURE_RDR_IRQ) && - ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || - (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) - mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - if ((mask & RCUTORTURE_RDR_IRQ) && - !(mask & cur_ops->ext_irq_conflict) && - (oldmask & cur_ops->ext_irq_conflict)) - mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ - return mask ?: RCUTORTURE_RDR_RCU; + + // Can't have nested RCU reader without outer RCU reader. + if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) { + if (oldmask & RCUTORTURE_RDR_RCU_1) + mask &= ~RCUTORTURE_RDR_RCU_2; + else + mask |= RCUTORTURE_RDR_RCU_1; + } + + /* + * Can't enable bh w/irq disabled. + */ + if (mask & RCUTORTURE_RDR_IRQ) + mask |= oldmask & bhs; + + /* + * Ideally these sequences would be detected in debug builds + * (regardless of RT), but until then don't stop testing + * them on non-RT. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* Can't modify BH in atomic context */ + if (oldmask & preempts_irq) + mask &= ~bhs; + if ((oldmask | mask) & preempts_irq) + mask |= oldmask & bhs; + } + + return mask ?: RCUTORTURE_RDR_RCU_1; } /* @@ -1195,8 +2285,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * critical section. */ static struct rt_read_seg * -rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, - struct rt_read_seg *rtrsp) +rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { int i; int j; @@ -1206,82 +2295,153 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, if (!((mask - 1) & mask)) return rtrsp; /* Current RCU reader not extendable. */ /* Bias towards larger numbers of loops. */ - i = (torture_random(trsp) >> 3); + i = torture_random(trsp); i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); + WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN); rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); } return &rtrsp[j]; } -/* - * Do one read-side critical section, returning false if there was - * no data to read. Can be invoked both from process context and - * from a timer handler. 
- */ -static bool rcu_torture_one_read(struct torture_random_state *trsp) -{ - int i; +struct rcu_torture_one_read_state { + bool checkpolling; + unsigned long cookie; + struct rcu_gp_oldstate cookie_full; unsigned long started; - unsigned long completed; - int newstate; struct rcu_torture *p; - int pipe_count; - int readstate = 0; - struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; - struct rt_read_seg *rtrsp = &rtseg[0]; - struct rt_read_seg *rtrsp1; + int readstate; + struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS]; + struct rt_read_seg *rtrsp; unsigned long long ts; +}; + +static void init_rcu_torture_one_read_state(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp) +{ + memset(rtorsp, 0, sizeof(*rtorsp)); + rtorsp->checkpolling = !(torture_random(trsp) & 0xfff); + rtorsp->rtrsp = &rtorsp->rtseg[0]; +} - newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); - started = cur_ops->get_gp_seq(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp) || - torturing_tasks()); - if (p == NULL) { +/* + * Set up the first segment of a series of overlapping read-side + * critical sections. The caller must have actually initiated the + * outermost read-side critical section. + */ +static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp, long myid) +{ + if (rtorsp->checkpolling) { + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + rtorsp->cookie = cur_ops->get_gp_state(); + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) + cur_ops->get_gp_state_full(&rtorsp->cookie_full); + } + rtorsp->started = cur_ops->get_gp_seq(); + rtorsp->ts = rcu_trace_clock_local(); + rtorsp->p = rcu_dereference_check(rcu_torture_current, + !cur_ops->readlock_held || cur_ops->readlock_held() || + (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN)); + if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); return false; } - if (p->rtort_mbtest == 0) + if (rtorsp->p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); - rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); + rcu_torture_reader_do_mbchk(myid, rtorsp->p, trsp); + return true; +} + +/* + * Complete the last segment of a series of overlapping read-side + * critical sections and check for errors. + */ +static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, + struct torture_random_state *trsp) +{ + int i; + unsigned long completed; + int pipe_count; + bool preempted = false; + struct rt_read_seg *rtrsp1; + preempt_disable(); - pipe_count = p->rtort_pipe_count; + pipe_count = READ_ONCE(rtorsp->p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ + // Should not happen in a correct RCU implementation, + // happens quite often for torture_type=busted. 
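[For orientation, the invariant behind rtort_pipe_count: once the writer unlinks an element, it advances that element one pipeline stage per grace period, so a reader that legitimately obtained the element can observe a count of at most 1. A stripped-down version of the reader-side test, with hypothetical struct and pointer names (the RCU primitives are the usual ones):

	#include <linux/rcupdate.h>

	struct my_elem {
		int pipe_count;	/* Bumped by the writer once per GP after removal. */
	};

	static struct my_elem __rcu *my_current;

	static bool my_reader_sees_stale(void)
	{
		bool stale = false;
		struct my_elem *p;

		rcu_read_lock();
		p = rcu_dereference(my_current);
		/*
		 * This critical section began before any grace period that
		 * followed p's replacement could end, so a count above 1
		 * means some grace period ended too soon.
		 */
		if (p && READ_ONCE(p->pipe_count) > 1)
			stale = true;
		rcu_read_unlock();
		return stale;
	}

The clamp below merely keeps such a bogus count from indexing past the end of the rcu_torture_count[] histogram.]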
pipe_count = RCU_TORTURE_PIPE_LEN; } completed = cur_ops->get_gp_seq(); if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, started, completed); + do_trace_rcu_torture_read(cur_ops->name, &rtorsp->p->rtort_rcu, + rtorsp->ts, rtorsp->started, completed); rcu_ftrace_dump(DUMP_ALL); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = rcutorture_seq_diff(completed, started); + completed = rcutorture_seq_diff(completed, rtorsp->started); if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); - WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + if (rtorsp->checkpolling) { + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(cur_ops->poll_gp_state(rtorsp->cookie), + "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + rtorsp->cookie, cur_ops->get_gp_state()); + if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) + WARN_ONCE(cur_ops->poll_gp_state_full(&rtorsp->cookie_full), + "%s: Cookie check 6 failed %s(%d) online %*pbl\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cpumask_pr_args(cpu_online_mask)); + } + if (cur_ops->reader_blocked) + preempted = cur_ops->reader_blocked(); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); + WARN_ON_ONCE(rtorsp->readstate); + // This next splat is expected behavior if leakpointer, especially + // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. + WARN_ON_ONCE(leakpointer && READ_ONCE(rtorsp->p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { i = 0; - for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) + for (rtrsp1 = &rtorsp->rtseg[0]; rtrsp1 < rtorsp->rtrsp; rtrsp1++) err_segs[i++] = *rtrsp1; rt_read_nsegs = i; + rt_read_preempted = preempted; } +} +/* + * Do one read-side critical section, returning false if there was + * no data to read. Can be invoked both from process context and + * from a timer handler. + */ +static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) +{ + int newstate; + struct rcu_torture_one_read_state rtors; + + WARN_ON_ONCE(!rcu_is_watching()); + init_rcu_torture_one_read_state(&rtors, trsp); + newstate = rcutorture_extend_mask(rtors.readstate, trsp); + WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); + rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++); + if (!rcu_torture_one_read_start(&rtors, trsp, myid)) + return false; + rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp); + rcu_torture_one_read_end(&rtors, trsp); return true; } @@ -1296,7 +2456,7 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); static void rcu_torture_timer(struct timer_list *unused) { atomic_long_inc(&n_rcu_torture_timers); - (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand)); + (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1); /* Test call_rcu() invocation from interrupt handler. */ if (cur_ops->call) { @@ -1326,30 +2486,225 @@ rcu_torture_reader(void *arg) set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) timer_setup_on_stack(&t, rcu_torture_timer, 0); - + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. 
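[The tick_dep_set_task() call above deserves a note: on CONFIG_NO_HZ_FULL kernels the scheduler tick can be stopped on a CPU running a single task, and a CPU-bound kthread may then delay RCU grace periods. The pairing such kthreads use, sketched with a hypothetical worker (tick_dep_set_task() and tick_dep_clear_task() are the stock interfaces from <linux/tick.h>):

	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/tick.h>

	static int my_busy_kthread(void *unused)
	{
		/* Keep the tick running so RCU keeps seeing quiescent states. */
		tick_dep_set_task(current, TICK_DEP_BIT_RCU);
		while (!kthread_should_stop()) {
			/* ... tight CPU-bound work would go here ... */
			cond_resched();
		}
		tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
		return 0;
	}

rcu_torture_reader() follows the same shape, clearing the dependency just before torture_kthread_stopping().]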
do { if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - if (!rcu_torture_one_read(&rand)) + if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop()) schedule_timeout_interruptible(HZ); - if (time_after(jiffies, lastsleep)) { - schedule_timeout_interruptible(1); + if (time_after(jiffies, lastsleep) && !torture_must_stop()) { + torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } - while (num_online_cpus() < mynumonline && !torture_must_stop()) + while (!torture_must_stop() && + (torture_num_online_cpus() < mynumonline || !rcu_inkernel_boot_has_ended())) schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { - del_timer_sync(&t); - destroy_timer_on_stack(&t); + timer_delete_sync(&t); + timer_destroy_on_stack(&t); } + tick_dep_clear_task(current, TICK_DEP_BIT_RCU); torture_kthread_stopping("rcu_torture_reader"); return 0; } +struct rcu_torture_one_read_state_updown { + struct hrtimer rtorsu_hrt; + bool rtorsu_inuse; + ktime_t rtorsu_kt; + int rtorsu_cpu; + unsigned long rtorsu_j; + unsigned long rtorsu_ndowns; + unsigned long rtorsu_nups; + unsigned long rtorsu_nmigrates; + struct torture_random_state rtorsu_trs; + struct rcu_torture_one_read_state rtorsu_rtors; +}; + +static struct rcu_torture_one_read_state_updown *updownreaders; +static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand); +static int rcu_torture_updown(void *arg); + +static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) +{ + int cpu = raw_smp_processor_id(); + struct rcu_torture_one_read_state_updown *rtorsup; + + rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + WRITE_ONCE(rtorsup->rtorsu_nmigrates, + rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu)); + smp_store_release(&rtorsup->rtorsu_inuse, false); + return HRTIMER_NORESTART; +} + +static int rcu_torture_updown_init(void) +{ + int i; + struct torture_random_state *rand = &rcu_torture_updown_rand; + int ret; + + if (n_up_down < 0) + return 0; + if (!srcu_torture_have_up_down()) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives"); + return 0; + } + updownreaders = kcalloc(n_up_down, sizeof(*updownreaders), GFP_KERNEL); + if (!updownreaders) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests"); + return -ENOMEM; + } + for (i = 0; i < n_up_down; i++) { + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand); + hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC, + HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + torture_random_init(&updownreaders[i].rtorsu_trs); + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, + &updownreaders[i].rtorsu_trs); + } + ret = torture_create_kthread(rcu_torture_updown, rand, updown_task); + if (ret) { + kfree(updownreaders); + updownreaders = NULL; + } + return ret; +} + +static void rcu_torture_updown_cleanup(void) +{ + struct rcu_torture_one_read_state_updown *rtorsup; + + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) 
+ continue; + if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + smp_store_release(&rtorsup->rtorsu_inuse, false); + } + + } + kfree(updownreaders); + updownreaders = NULL; +} + +// Do one reader for rcu_torture_updown(). +static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup) +{ + int idx; + int rawidx; + ktime_t t; + + init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + rawidx = cur_ops->down_read(); + WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1); + idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; + rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; + rtorsup->rtorsu_rtors.rtrsp++; + rtorsup->rtorsu_cpu = raw_smp_processor_id(); + if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + schedule_timeout_idle(HZ); + return; + } + smp_store_release(&rtorsup->rtorsu_inuse, true); + t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. + if (t < 10 * 1000) + t = 200 * 1000 * 1000; + hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + smp_mb(); // Sample jiffies after posting hrtimer. + rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler. + rtorsup->rtorsu_kt = t; +} + +/* + * RCU torture up/down reader kthread, starting RCU readers in kthread + * context and ending them in hrtimer handlers. Otherwise similar to + * rcu_torture_reader(). + */ +static int +rcu_torture_updown(void *arg) +{ + unsigned long j; + struct rcu_torture_one_read_state_updown *rtorsup; + + VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); + do { + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (torture_must_stop()) + break; + j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse. + if (smp_load_acquire(&rtorsup->rtorsu_inuse)) { + WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10), + "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j); + continue; + } + rcu_torture_updown_one(rtorsup); + } + torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); + stutter_wait("rcu_torture_updown"); + } while (!torture_must_stop()); + rcu_torture_updown_cleanup(); + torture_kthread_stopping("rcu_torture_updown"); + return 0; +} + +/* + * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to + * increase race probabilities and fuzzes the interval between toggling. 
+ */ +static int rcu_nocb_toggle(void *arg) +{ + int cpu; + int maxcpu = -1; + int oldnice = task_nice(current); + long r; + DEFINE_TORTURE_RANDOM(rand); + ktime_t toggle_delay; + unsigned long toggle_fuzz; + ktime_t toggle_interval = ms_to_ktime(nocbs_toggle); + + VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started"); + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); + for_each_possible_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (toggle_interval > ULONG_MAX) + toggle_fuzz = ULONG_MAX >> 3; + else + toggle_fuzz = toggle_interval >> 3; + if (toggle_fuzz <= 0) + toggle_fuzz = NSEC_PER_USEC; + do { + r = torture_random(&rand); + cpu = (r >> 1) % (maxcpu + 1); + if (r & 0x1) { + rcu_nocb_cpu_offload(cpu); + atomic_long_inc(&n_nocb_offload); + } else { + rcu_nocb_cpu_deoffload(cpu); + atomic_long_inc(&n_nocb_deoffload); + } + toggle_delay = torture_random(&rand) % toggle_fuzz + toggle_interval; + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&toggle_delay, HRTIMER_MODE_REL); + if (stutter_wait("rcu_nocb_toggle")) + sched_set_normal(current, oldnice); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_nocb_toggle"); + return 0; +} + /* * Print torture statistics. Caller must ensure that there is only * one call to this function at a given time!!! This is normally @@ -1365,54 +2720,81 @@ rcu_torture_stats_print(void) int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long n_gpwraps = 0; + unsigned long ndowns = 0; + unsigned long nunexpired = 0; + unsigned long nmigrates = 0; + unsigned long nups = 0; + struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; struct task_struct *wtp; for_each_possible_cpu(cpu) { for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]); + batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); + } + if (cur_ops->get_gpwrap_count) + n_gpwraps += cur_ops->get_gpwrap_count(cpu); + } + if (updownreaders) { + for (i = 0; i < n_up_down; i++) { + ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns); + nups += READ_ONCE(updownreaders[i].rtorsu_nups); + nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse); + nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates); } } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; - } + } // The value of variable "i" is used later, so don't clobber it! pr_alert("%s%s ", torture_type, TORTURE_FLAG); - pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, + rtcp = rcu_access_pointer(rcu_torture_current); + pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + rtcp, + rtcp && !rcu_stall_is_suppressed_at_boot() ? 
"ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ", atomic_read(&n_rcu_torture_mberror), + atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries), n_rcu_torture_barrier_error, - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); + n_rcu_torture_boost_ktrerror); pr_cont("rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); + if (updownreaders) + pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld\n", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); + pr_cont("barrier: %ld/%ld:%ld ", + data_race(n_barrier_successes), + data_race(n_barrier_attempts), + data_race(n_rcu_torture_barrier_error)); + pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic. + pr_cont("nocb-toggles: %ld:%ld ", + atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); + pr_cont("gpwraps: %ld\n", n_gpwraps); pr_alert("%s%s ", torture_type, TORTURE_FLAG); - if (atomic_read(&n_rcu_torture_mberror) != 0 || - n_rcu_torture_barrier_error != 0 || - n_rcu_torture_boost_ktrerror != 0 || - n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0 || - i > 1) { + if (atomic_read(&n_rcu_torture_mberror) || + atomic_read(&n_rcu_torture_mbchk_fail) || + n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || + n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); - WARN_ON_ONCE(1); + WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); + WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail)); + WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() + WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread + WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?) + WARN_ON_ONCE(i > 1); // Too-short grace period } pr_cont("Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) @@ -1435,25 +2817,26 @@ rcu_torture_stats_print(void) if (cur_ops->stats) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && - rcu_torture_current != NULL) { + rcu_access_pointer(rcu_torture_current) && + !rcu_stall_is_suppressed() && + rcu_inkernel_boot_has_ended()) { int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; - rcutorture_get_gp_data(cur_ops->ttype, - &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#lx cpu %d\n", + pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, gp_seq, flags, - wtp == NULL ? ~0UL : wtp->state, + wtp == NULL ? ~0U : wtp->__state, wtp == NULL ? -1 : (int)task_cpu(wtp)); if (!splatted && wtp) { sched_show_task(wtp); splatted = true; } - show_rcu_gp_kthreads(); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; @@ -1476,6 +2859,56 @@ rcu_torture_stats(void *arg) return 0; } +/* Test mem_dump_obj() and friends. 
*/ +static void rcu_torture_mem_dump_obj(void) +{ + struct rcu_head *rhp; + struct kmem_cache *kcp; + static int z; + + kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); + if (WARN_ON_ONCE(!kcp)) + return; + rhp = kmem_cache_alloc(kcp, GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) { + kmem_cache_destroy(kcp); + return; + } + pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); + pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); + mem_dump_obj(ZERO_SIZE_PTR); + pr_alert("mem_dump_obj(NULL):"); + mem_dump_obj(NULL); + pr_alert("mem_dump_obj(%px):", &rhp); + mem_dump_obj(&rhp); + pr_alert("mem_dump_obj(%px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(%px):", &rhp->func); + mem_dump_obj(&rhp->func); + pr_alert("mem_dump_obj(%px):", &z); + mem_dump_obj(&z); + kmem_cache_free(kcp, rhp); + kmem_cache_destroy(kcp); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) + return; + pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + pr_alert("mem_dump_obj(kmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + kfree(rhp); + rhp = vmalloc(4096); + if (WARN_ON_ONCE(!rhp)) + return; + pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + pr_alert("mem_dump_obj(vmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + vfree(rhp); +} + static void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { @@ -1485,18 +2918,30 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " + "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " + "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealreaders, nfakewriters, + "onoff_interval=%d onoff_holdoff=%d " + "read_exit_delay=%d read_exit_burst=%d " + "reader_flavor=%x " + "nocbs_nthreads=%d nocbs_toggle=%d " + "test_nmis=%d " + "preempt_duration=%d preempt_interval=%d n_up_down=%d\n", + torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, + test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, + stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, - onoff_interval, onoff_holdoff); + onoff_interval, onoff_holdoff, + read_exit_delay, read_exit_burst, + reader_flavor, + nocbs_nthreads, nocbs_toggle, + test_nmis, + preempt_duration, preempt_interval, n_up_down); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -1523,13 +2968,33 @@ static int rcutorture_booster_init(unsigned int cpu) if (boost_tasks[cpu] != NULL) return 0; /* Already created, nothing more to do. */ + // Testing RCU priority boosting requires rcutorture do + // some serious abuse. Counter this by running ksoftirqd + // at higher priority. 
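[Why this matters: the rcu_torture_boost() kthreads run at low SCHED_FIFO priority (note the sched_set_fifo_low() call earlier in this patch), and real-time spinners can starve the per-CPU softirq and timer kthreads that RCU callback processing depends on, so the block below raises those helpers to FIFO priority 2. For reference, the generic in-kernel pattern for pinning a kernel thread at an explicit RT priority looks roughly like this; my_set_fifo_prio() is a hypothetical helper, while sched_setscheduler_nocheck() is the real interface, the _nocheck variant skipping capability checks for kernel-internal callers:

	#include <linux/sched.h>
	#include <uapi/linux/sched/types.h>

	static void my_set_fifo_prio(struct task_struct *t, int prio)
	{
		struct sched_param sp = { .sched_priority = prio };

		/* Kernel-internal policy change; no permission checks. */
		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
	}]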
+ if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(ksoftirqd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); +#ifdef CONFIG_IRQ_FORCED_THREADING + if (force_irqthreads()) { + t = per_cpu(ktimerd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } +#endif + } + /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL, + cpu, "rcu_torture_boost_%u"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); @@ -1538,45 +3003,107 @@ static int rcutorture_booster_init(unsigned int cpu) mutex_unlock(&boost_mutex); return retval; } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); mutex_unlock(&boost_mutex); return 0; } +static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr) +{ + pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr); + return NOTIFY_OK; +} + +static struct notifier_block rcu_torture_stall_block = { + .notifier_call = rcu_torture_stall_nf, +}; + /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then - * induces a CPU stall for the time specified by stall_cpu. + * induces a CPU stall for the time specified by stall_cpu. If a new + * stall test is added, stallsdone in rcu_torture_writer() must be adjusted. */ -static int rcu_torture_stall(void *args) +static void rcu_torture_stall_one(int rep, int irqsoff) { + int idx; unsigned long stop_at; - VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } - if (!kthread_should_stop()) { + if (!kthread_should_stop() && stall_gp_kthread > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin GP stall"); + rcu_gp_set_torture_wait(stall_gp_kthread * HZ); + for (idx = 0; idx < stall_gp_kthread + 2; idx++) { + if (kthread_should_stop()) + break; + schedule_timeout_uninterruptible(HZ); + } + } + if (!kthread_should_stop() && stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin CPU stall"); stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - rcu_read_lock(); - if (stall_cpu_irqsoff) + idx = cur_ops->readlock(); + if (irqsoff) local_irq_disable(); - else + else if (!stall_cpu_block) preempt_disable(); - pr_alert("rcu_torture_stall start on CPU %d.\n", - smp_processor_id()); - while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), - stop_at)) - continue; /* Induce RCU CPU stall warning. 
*/ - if (stall_cpu_irqsoff) + pr_alert("%s start stall episode %d on CPU %d.\n", + __func__, rep + 1, raw_smp_processor_id()); + while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) && + !kthread_should_stop()) + if (stall_cpu_block) { +#ifdef CONFIG_PREEMPTION + preempt_schedule(); +#else + schedule_timeout_uninterruptible(HZ); +#endif + } else if (stall_no_softlockup) { + touch_softlockup_watchdog(); + } + if (irqsoff) local_irq_enable(); - else + else if (!stall_cpu_block) preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); + cur_ops->readunlock(idx); + } +} + +/* + * CPU-stall kthread. Invokes rcu_torture_stall_one() once, and then as many + * additional times as specified by the stall_cpu_repeat module parameter. + * Note that stall_cpu_irqsoff is ignored on the second and subsequent + * stall. + */ +static int rcu_torture_stall(void *args) +{ + int i; + int repeat = stall_cpu_repeat; + int ret; + + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + if (repeat < 0) { + repeat = 0; + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + } + if (rcu_cpu_stall_notifiers) { + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); + } + for (i = 0; i <= repeat; i++) { + if (kthread_should_stop()) + break; + rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0); + } + pr_alert("%s end.\n", __func__); + if (rcu_cpu_stall_notifiers && !ret) { + ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret); } torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) @@ -1587,7 +3114,7 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - if (stall_cpu <= 0) + if (stall_cpu <= 0 && stall_gp_kthread <= 0) return 0; return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } @@ -1618,33 +3145,59 @@ static void rcu_torture_fwd_prog_cb(struct rcu_head *rhp) struct rcu_fwd_cb { struct rcu_head rh; struct rcu_fwd_cb *rfc_next; + struct rcu_fwd *rfc_rfp; int rfc_gps; }; -static DEFINE_SPINLOCK(rcu_fwd_lock); -static struct rcu_fwd_cb *rcu_fwd_cb_head; -static struct rcu_fwd_cb **rcu_fwd_cb_tail = &rcu_fwd_cb_head; -static long n_launders_cb; -static unsigned long rcu_fwd_startat; -static bool rcu_fwd_emergency_stop; + #define MAX_FWD_CB_JIFFIES (8 * HZ) /* Maximum CB test duration. */ #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ #define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. 
*/ -static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)]; +#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) + +struct rcu_launder_hist { + long n_launders; + unsigned long launder_gp_seq; +}; -static void rcu_torture_fwd_cb_hist(void) +struct rcu_fwd { + spinlock_t rcu_fwd_lock; + struct rcu_fwd_cb *rcu_fwd_cb_head; + struct rcu_fwd_cb **rcu_fwd_cb_tail; + long n_launders_cb; + unsigned long rcu_fwd_startat; + struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; + unsigned long rcu_launder_gp_seq_start; + int rcu_fwd_id; +}; + +static DEFINE_MUTEX(rcu_fwd_mutex); +static struct rcu_fwd *rcu_fwds; +static unsigned long rcu_fwd_seq; +static atomic_long_t rcu_fwd_max_cbs; +static bool rcu_fwd_emergency_stop; + +static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { + unsigned long gps; + unsigned long gps_old; int i; int j; - for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i] > 0) + for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) + if (rfp->n_launders_hist[i].n_launders > 0) break; - pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rcu_fwd_startat); - for (j = 0; j <= i; j++) - pr_cont(" %ds/%d: %ld", - j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]); + pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):", + __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat); + gps_old = rfp->rcu_launder_gp_seq_start; + for (j = 0; j <= i; j++) { + gps = rfp->n_launders_hist[j].launder_gp_seq; + pr_cont(" %ds/%d: %ld:%ld", + j + 1, FWD_CBS_HIST_DIV, + rfp->n_launders_hist[j].n_launders, + rcutorture_seq_diff(gps, gps_old)); + gps_old = gps; + } pr_cont("\n"); } @@ -1655,49 +3208,72 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) int i; struct rcu_fwd_cb *rfcp = container_of(rhp, struct rcu_fwd_cb, rh); struct rcu_fwd_cb **rfcpp; + struct rcu_fwd *rfp = rfcp->rfc_rfp; rfcp->rfc_next = NULL; rfcp->rfc_gps++; - spin_lock_irqsave(&rcu_fwd_lock, flags); - rfcpp = rcu_fwd_cb_tail; - rcu_fwd_cb_tail = &rfcp->rfc_next; - WRITE_ONCE(*rfcpp, rfcp); - WRITE_ONCE(n_launders_cb, n_launders_cb + 1); - i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); - if (i >= ARRAY_SIZE(n_launders_hist)) - i = ARRAY_SIZE(n_launders_hist) - 1; - n_launders_hist[i]++; - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); + rfcpp = rfp->rcu_fwd_cb_tail; + rfp->rcu_fwd_cb_tail = &rfcp->rfc_next; + smp_store_release(rfcpp, rfcp); + WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1); + i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); + if (i >= ARRAY_SIZE(rfp->n_launders_hist)) + i = ARRAY_SIZE(rfp->n_launders_hist) - 1; + rfp->n_launders_hist[i].n_launders++; + rfp->n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); +} + +// Give the scheduler a chance, even on nohz_full CPUs. +static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) +{ + if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + // Real call_rcu() floods hit userspace, so emulate that. + if (need_resched() || (iter & 0xfff)) + schedule(); + return; + } + // No userspace emulation: CB invocation throttles call_rcu() + cond_resched(); } /* * Free all callbacks on the rcu_fwd_cb_head list, either because the * test is over or because we hit an OOM event. 
*/ -static unsigned long rcu_torture_fwd_prog_cbfree(void) +static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp) { unsigned long flags; unsigned long freed = 0; struct rcu_fwd_cb *rfcp; for (;;) { - spin_lock_irqsave(&rcu_fwd_lock, flags); - rfcp = rcu_fwd_cb_head; - if (!rfcp) + spin_lock_irqsave(&rfp->rcu_fwd_lock, flags); + rfcp = rfp->rcu_fwd_cb_head; + if (!rfcp) { + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); break; - rcu_fwd_cb_head = rfcp->rfc_next; - if (!rcu_fwd_cb_head) - rcu_fwd_cb_tail = &rcu_fwd_cb_head; - spin_unlock_irqrestore(&rcu_fwd_lock, flags); + } + rfp->rcu_fwd_cb_head = rfcp->rfc_next; + if (!rfp->rcu_fwd_cb_head) + rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + spin_unlock_irqrestore(&rfp->rcu_fwd_lock, flags); kfree(rfcp); freed++; + rcu_torture_fwd_prog_cond_resched(freed); + if (tick_nohz_full_enabled()) { + local_irq_save(flags); + rcu_momentary_eqs(); + local_irq_restore(flags); + } } - spin_unlock_irqrestore(&rcu_fwd_lock, flags); return freed; } /* Carry out need_resched()/cond_resched() forward-progress testing. */ -static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) +static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, + int *tested, int *tested_tries) { unsigned long cver; unsigned long dur; @@ -1710,12 +3286,17 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) unsigned long stopat; static DEFINE_TORTURE_RANDOM(trs); - if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); + if (!cur_ops->sync) + return; // Cannot do need_resched() forward progress testing without ->sync. + if (cur_ops->call && cur_ops->cb_barrier) { init_rcu_head_on_stack(&fcs.rh); selfpropcb = true; } /* Tight loop containing cond_resched(). */ + atomic_inc(&rcu_fwd_cb_nodelay); + cur_ops->sync(); /* Later readers see above write. */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); @@ -1725,9 +3306,10 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) sd = cur_ops->stall_dur() + 1; sd4 = (sd + fwd_progress_div - 1) / fwd_progress_div; dur = sd4 + torture_random(&trs) % (sd - sd4); - WRITE_ONCE(rcu_fwd_startat, jiffies); - stopat = rcu_fwd_startat + dur; + WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); + stopat = rfp->rcu_fwd_startat + dur; while (time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { idx = cur_ops->readlock(); udelay(10); @@ -1737,16 +3319,19 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } (*tested_tries)++; if (!time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { (*tested)++; cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); WARN_ON(!cver && gps < 2); - pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__, + rfp->rcu_fwd_id, dur, cver, gps); } if (selfpropcb) { WRITE_ONCE(fcs.stop, 1); cur_ops->sync(); /* Wait for running CB to complete. */ + pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id); cur_ops->cb_barrier(); /* Wait for queued callbacks. 
*/ } @@ -1754,12 +3339,15 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) WARN_ON(READ_ONCE(fcs.stop) != 2); destroy_rcu_head_on_stack(&fcs.rh); } + schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */ + atomic_dec(&rcu_fwd_cb_nodelay); } /* Carry out call_rcu() forward-progress testing. */ -static void rcu_torture_fwd_prog_cr(void) +static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) { unsigned long cver; + unsigned long flags; unsigned long gps; int i; long n_launders; @@ -1772,26 +3360,32 @@ static void rcu_torture_fwd_prog_cr(void) unsigned long stopat; unsigned long stoppedat; + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); if (READ_ONCE(rcu_fwd_emergency_stop)) return; /* Get out of the way quickly, no GP wait! */ + if (!cur_ops->call) + return; /* Can't do call_rcu() fwd prog without ->call. */ /* Loop continuously posting RCU callbacks. */ - WRITE_ONCE(rcu_fwd_cb_nodelay, true); + atomic_inc(&rcu_fwd_cb_nodelay); cur_ops->sync(); /* Later readers see above write. */ - WRITE_ONCE(rcu_fwd_startat, jiffies); - stopat = rcu_fwd_startat + MAX_FWD_CB_JIFFIES; + WRITE_ONCE(rfp->rcu_fwd_startat, jiffies); + stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES; n_launders = 0; - n_launders_cb = 0; + rfp->n_launders_cb = 0; // Hoist initialization for multi-kthread n_launders_sa = 0; n_max_cbs = 0; n_max_gps = 0; - for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i] = 0; + for (i = 0; i < ARRAY_SIZE(rfp->n_launders_hist); i++) + rfp->n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); + rfp->rcu_launder_gp_seq_start = gps; + tick_dep_set_task(current, TICK_DEP_BIT_RCU); // CPU bound, so need tick. while (time_before(jiffies, stopat) && + !shutdown_time_arrived() && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { - rfcp = READ_ONCE(rcu_fwd_cb_head); + rfcp = READ_ONCE(rfp->rcu_fwd_cb_head); rfcpn = NULL; if (rfcp) rfcpn = READ_ONCE(rfcp->rfc_next); @@ -1799,10 +3393,10 @@ static void rcu_torture_fwd_prog_cr(void) if (rfcp->rfc_gps >= MIN_FWD_CB_LAUNDERS && ++n_max_gps >= MIN_FWD_CBS_LAUNDERED) break; - rcu_fwd_cb_head = rfcpn; + rfp->rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; - } else { + } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) { rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); if (WARN_ON_ONCE(!rfcp)) { schedule_timeout_interruptible(1); @@ -1811,28 +3405,45 @@ static void rcu_torture_fwd_prog_cr(void) n_max_cbs++; n_launders_sa = 0; rfcp->rfc_gps = 0; + rfcp->rfc_rfp = rfp; + } else { + rfcp = NULL; + } + if (rfcp) + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); + if (tick_nohz_full_enabled()) { + local_irq_save(flags); + rcu_momentary_eqs(); + local_irq_restore(flags); } - cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - cond_resched(); } stoppedat = jiffies; - n_launders_cb_snap = READ_ONCE(n_launders_cb); + n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb); cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); + pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id); cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. 
*/ - (void)rcu_torture_fwd_prog_cbfree(); + (void)rcu_torture_fwd_prog_cbfree(rfp); - WRITE_ONCE(rcu_fwd_cb_nodelay, false); - if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { - WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); - pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", + if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && + !shutdown_time_arrived()) { + if (WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED) && cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); + pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n", __func__, - stoppedat - rcu_fwd_startat, jiffies - stoppedat, + stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, - n_max_gps, n_max_cbs, cver, gps); - rcu_torture_fwd_cb_hist(); + n_max_gps, n_max_cbs, cver, gps, num_online_cpus()); + atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); + mutex_lock(&rcu_fwd_mutex); // Serialize histograms. + rcu_torture_fwd_cb_hist(rfp); + mutex_unlock(&rcu_fwd_mutex); } + schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ + tick_dep_clear_task(current, TICK_DEP_BIT_RCU); + atomic_dec(&rcu_fwd_cb_nodelay); } @@ -1843,23 +3454,42 @@ static void rcu_torture_fwd_prog_cr(void) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + int i; + long ncbs; + struct rcu_fwd *rfp; + + mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; + if (!rfp) { + mutex_unlock(&rcu_fwd_mutex); + return NOTIFY_OK; + } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); + for (i = 0; i < fwd_progress; i++) { + rcu_torture_fwd_cb_hist(&rfp[i]); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2); + } WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); - rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); - rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree()); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); + cur_ops->cb_barrier(); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); + cur_ops->cb_barrier(); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); + mutex_unlock(&rcu_fwd_mutex); return NOTIFY_OK; } @@ -1870,27 +3500,53 @@ static struct notifier_block rcutorture_oom_nb = { /* Carry out grace-period forward-progress testing. 
*/ static int rcu_torture_fwd_prog(void *args) { + bool firsttime = true; + long max_cbs; + int oldnice = task_nice(current); + unsigned long oldseq = READ_ONCE(rcu_fwd_seq); + struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; VERBOSE_TOROUT_STRING("rcu_torture_fwd_progress task started"); + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); rcu_bind_current_to_nocb(); if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); do { - schedule_timeout_interruptible(fwd_progress_holdoff * HZ); - WRITE_ONCE(rcu_fwd_emergency_stop, false); - register_oom_notifier(&rcutorture_oom_nb); - rcu_torture_fwd_prog_nr(&tested, &tested_tries); - rcu_torture_fwd_prog_cr(); - unregister_oom_notifier(&rcutorture_oom_nb); + if (!rfp->rcu_fwd_id) { + schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + WRITE_ONCE(rcu_fwd_emergency_stop, false); + if (!firsttime) { + max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0); + pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs); + } + firsttime = false; + WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); + } else { + while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop()) + schedule_timeout_interruptible(HZ / 20); + oldseq = READ_ONCE(rcu_fwd_seq); + } + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); + if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id) + rcu_torture_fwd_prog_cr(rfp); + if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) && + (!IS_ENABLED(CONFIG_TINY_RCU) || + (rcu_inkernel_boot_has_ended() && + torture_num_online_cpus() > rfp->rcu_fwd_id))) + rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); /* Avoid slow periods, better to test when busy. */ - stutter_wait("rcu_torture_fwd_prog"); + if (stutter_wait("rcu_torture_fwd_prog")) + sched_set_normal(current, oldnice); } while (!torture_must_stop()); /* Short runs might not contain a valid forward-progress attempt. */ - WARN_ON(!tested && tested_tries >= 5); - pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + if (!rfp->rcu_fwd_id) { + WARN_ON(!tested && tested_tries >= 5); + pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + } torture_kthread_stopping("rcu_torture_fwd_prog"); return 0; } @@ -1898,26 +3554,82 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { + int i; + int ret = 0; + struct rcu_fwd *rfp; + if (!fwd_progress) return 0; /* Not requested, so don't do it. 
*/ - if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || + if (fwd_progress >= nr_cpu_ids) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n"); + fwd_progress = nr_cpu_ids; + } else if (fwd_progress < 0) { + fwd_progress = nr_cpu_ids; + } + if ((!cur_ops->sync && !cur_ops->call) || + (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) || cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); + fwd_progress = 0; return 0; } - if (stall_cpu > 0) { - VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); - if (IS_MODULE(CONFIG_RCU_TORTURE_TESTS)) + if (stall_cpu > 0 || (preempt_duration > 0 && IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall and/or preemption testing"); + fwd_progress = 0; + if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ - WARN_ON(1); /* Make sure rcutorture notices conflict. */ + WARN_ON(1); /* Make sure rcutorture scripting notices conflict. */ return 0; } if (fwd_progress_holdoff <= 0) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; - return torture_create_kthread(rcu_torture_fwd_prog, - NULL, fwd_prog_task); + rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL); + fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL); + if (!rfp || !fwd_prog_tasks) { + kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; + fwd_progress = 0; + return -ENOMEM; + } + for (i = 0; i < fwd_progress; i++) { + spin_lock_init(&rfp[i].rcu_fwd_lock); + rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head; + rfp[i].rcu_fwd_id = i; + } + mutex_lock(&rcu_fwd_mutex); + rcu_fwds = rfp; + mutex_unlock(&rcu_fwd_mutex); + register_oom_notifier(&rcutorture_oom_nb); + for (i = 0; i < fwd_progress; i++) { + ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]); + if (ret) { + fwd_progress = i; + return ret; + } + } + return 0; +} + +static void rcu_torture_fwd_prog_cleanup(void) +{ + int i; + struct rcu_fwd *rfp; + + if (!rcu_fwds || !fwd_prog_tasks) + return; + for (i = 0; i < fwd_progress; i++) + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]); + unregister_oom_notifier(&rcutorture_oom_nb); + mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; + rcu_fwds = NULL; + mutex_unlock(&rcu_fwd_mutex); + kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; } /* Callback function for RCU barrier testing. */ @@ -1926,11 +3638,20 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) atomic_inc(&barrier_cbs_invoked); } +/* IPI handler to get callback posted on desired CPU, if online. */ +static int rcu_torture_barrier1cb(void *rcu_void) +{ + struct rcu_head *rhp = rcu_void; + + cur_ops->call(rhp, rcu_torture_barrier_cbf); + return 0; +} + /* kthread function to register callbacks used to test RCU barriers. */ static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; - bool lastphase = 0; + bool lastphase = false; bool newphase; struct rcu_head rcu; @@ -1949,9 +3670,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the following ->call(). */ - local_irq_disable(); /* Just to test no-irq call_rcu(). 
*/ - cur_ops->call(&rcu, rcu_torture_barrier_cbf); - local_irq_enable(); + if (smp_call_on_cpu(myid, rcu_torture_barrier1cb, &rcu, 1)) + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); @@ -1987,7 +3708,21 @@ static int rcu_torture_barrier(void *arg) pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n", atomic_read(&barrier_cbs_invoked), n_barrier_cbs); - WARN_ON_ONCE(1); + WARN_ON(1); + // Wait manually for the remaining callbacks + i = 0; + do { + if (WARN_ON(i++ > HZ)) + i = INT_MIN; + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + } while (atomic_read(&barrier_cbs_invoked) != + n_barrier_cbs && + !torture_must_stop()); + smp_mb(); // Can't trust ordering if broken. + if (!torture_must_stop()) + pr_err("Recovered: barrier_cbs_invoked = %d\n", + atomic_read(&barrier_cbs_invoked)); } else { n_barrier_successes++; } @@ -2060,13 +3795,15 @@ static bool rcu_torture_can_boost(void) if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2) return false; + if (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state) + return false; prio = rcu_get_gp_kthreads_prio(); if (!prio) return false; if (prio < 2) { - if (boost_warn_once == 1) + if (boost_warn_once == 1) return false; pr_alert("%s: WARN: RCU kthread priority too low to test boosting. Skipping RCU boost test. Try passing rcutree.kthread_prio > 1 on the kernel command line.\n", KBUILD_MODNAME); @@ -2077,8 +3814,197 @@ static bool rcu_torture_can_boost(void) return true; } +static bool read_exit_child_stop; +static bool read_exit_child_stopped; +static wait_queue_head_t read_exit_wq; + +// Child kthread which just does an rcutorture reader and exits. +static int rcu_torture_read_exit_child(void *trsp_in) +{ + struct torture_random_state *trsp = trsp_in; + + set_user_nice(current, MAX_NICE); + // Minimize time between reading and exiting. + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(HZ / 20); + (void)rcu_torture_one_read(trsp, -1); + return 0; +} + +// Parent kthread which creates and destroys read-exit child kthreads. +static int rcu_torture_read_exit(void *unused) +{ + bool errexit = false; + int i; + struct task_struct *tsp; + DEFINE_TORTURE_RANDOM(trs); + + // Allocate and initialize. + set_user_nice(current, MAX_NICE); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test"); + + // Each pass through this loop does one read-exit episode. + do { + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + for (i = 0; i < read_exit_burst; i++) { + if (READ_ONCE(read_exit_child_stop)) + break; + stutter_wait("rcu_torture_read_exit"); + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + TOROUT_ERRSTRING("out of memory"); + errexit = true; + break; + } + cond_resched(); + kthread_stop(tsp); + n_read_exits++; + } + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + i = 0; + for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++) + schedule_timeout_uninterruptible(HZ); + } while (!errexit && !READ_ONCE(read_exit_child_stop)); + + // Clean up and exit. + smp_store_release(&read_exit_child_stopped, true); // After reaping. + smp_mb(); // Store before wakeup. 
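+ /*
+ * The smp_store_release() above pairs with the smp_load_acquire()
+ * in rcu_torture_read_exit_cleanup()'s wait_event() condition, and
+ * the smp_mb() orders that store before the wakeup below, so the
+ * cleanup path cannot see read_exit_child_stopped == true until all
+ * child tasks have been reaped.  A minimal sketch of this
+ * publish-then-wake idiom, using hypothetical names that are not
+ * part of this patch:
+ *
+ *	static bool done;
+ *	static DECLARE_WAIT_QUEUE_HEAD(wq);
+ *
+ *	static void publisher(void)
+ *	{
+ *		smp_store_release(&done, true); // Publish completion.
+ *		smp_mb(); // Order store before wakeup.
+ *		wake_up(&wq);
+ *	}
+ *
+ *	static void waiter(void)
+ *	{
+ *		// Acquire load pairs with the release store above.
+ *		wait_event(wq, smp_load_acquire(&done));
+ *	}
+ */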
+ wake_up(&read_exit_wq); + while (!torture_must_stop()) + schedule_timeout_uninterruptible(HZ / 20); + torture_kthread_stopping("rcu_torture_read_exit"); + return 0; +} + +static int rcu_torture_read_exit_init(void) +{ + if (read_exit_burst <= 0) + return 0; + init_waitqueue_head(&read_exit_wq); + read_exit_child_stop = false; + read_exit_child_stopped = false; + return torture_create_kthread(rcu_torture_read_exit, NULL, + read_exit_task); +} + +static void rcu_torture_read_exit_cleanup(void) +{ + if (!read_exit_task) + return; + WRITE_ONCE(read_exit_child_stop, true); + smp_mb(); // Above write before wait. + wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped)); + torture_stop_kthread(rcutorture_read_exit, read_exit_task); +} + +static void rcutorture_test_nmis(int n) +{ +#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + int cpu; + int dumpcpu; + int i; + + for (i = 0; i < n; i++) { + preempt_disable(); + cpu = smp_processor_id(); + dumpcpu = cpu + 1; + if (dumpcpu >= nr_cpu_ids) + dumpcpu = 0; + pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu); + dump_cpu_task(dumpcpu); + preempt_enable(); + schedule_timeout_uninterruptible(15 * HZ); + } +#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) + WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis); +#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) +} + +// Randomly preempt online CPUs. +static int rcu_torture_preempt(void *unused) +{ + int cpu = -1; + DEFINE_TORTURE_RANDOM(rand); + + schedule_timeout_idle(stall_cpu_holdoff); + do { + // Wait for preempt_interval ms with up to 100us fuzz. + torture_hrtimeout_ms(preempt_interval, 100, &rand); + // Select online CPU. + cpu = cpumask_next(cpu, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_next(-1, cpu_online_mask); + WARN_ON_ONCE(cpu >= nr_cpu_ids); + // Move to that CPU, if can't do so, retry later. + if (torture_sched_setaffinity(current->pid, cpumask_of(cpu), false)) + continue; + // Preempt at high-ish priority, then reset to normal. 
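+ //
+ // Descriptive note: at this point the kthread has pinned itself to
+ // the chosen online CPU, so the sched_set_fifo() below promotes it
+ // to a mid-range SCHED_FIFO priority, preempting whatever
+ // SCHED_NORMAL task was running there.  It then restores the wide
+ // cpu_present_mask affinity, busy-waits in mdelay() for
+ // preempt_duration milliseconds at that elevated priority, and
+ // finally drops back to SCHED_NORMAL via sched_set_normal().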
+ sched_set_fifo(current); + torture_sched_setaffinity(current->pid, cpu_present_mask, true); + mdelay(preempt_duration); + sched_set_normal(current, 0); + stutter_wait("rcu_torture_preempt"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_preempt"); + return 0; +} + static enum cpuhp_state rcutor_hp; +static struct hrtimer gpwrap_lag_timer; +static bool gpwrap_lag_active; + +/* Timer handler for toggling RCU grace-period sequence overflow test lag value */ +static enum hrtimer_restart rcu_gpwrap_lag_timer(struct hrtimer *timer) +{ + ktime_t next_delay; + + if (gpwrap_lag_active) { + pr_alert("rcu-torture: Disabling gpwrap lag (value=0)\n"); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; + next_delay = ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0); + } else { + pr_alert("rcu-torture: Enabling gpwrap lag (value=%d)\n", gpwrap_lag_gps); + cur_ops->set_gpwrap_lag(gpwrap_lag_gps); + gpwrap_lag_active = true; + next_delay = ktime_set(gpwrap_lag_active_mins * 60, 0); + } + + if (torture_must_stop_irq()) + return HRTIMER_NORESTART; + + hrtimer_forward_now(timer, next_delay); + return HRTIMER_RESTART; +} + +static int rcu_gpwrap_lag_init(void) +{ + if (!gpwrap_lag) + return 0; + + if (gpwrap_lag_cycle_mins <= 0 || gpwrap_lag_active_mins <= 0) { + pr_alert("rcu-torture: lag timing parameters must be positive\n"); + return -EINVAL; + } + + hrtimer_setup(&gpwrap_lag_timer, rcu_gpwrap_lag_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gpwrap_lag_active = false; + hrtimer_start(&gpwrap_lag_timer, + ktime_set((gpwrap_lag_cycle_mins - gpwrap_lag_active_mins) * 60, 0), HRTIMER_MODE_REL); + + return 0; +} + +static void rcu_gpwrap_lag_cleanup(void) +{ + hrtimer_cancel(&gpwrap_lag_timer); + cur_ops->set_gpwrap_lag(0); + gpwrap_lag_active = false; +} static void rcu_torture_cleanup(void) { @@ -2086,53 +4012,85 @@ rcu_torture_cleanup(void) int flags = 0; unsigned long gp_seq = 0; int i; + int j; if (torture_cleanup_begin()) { - if (cur_ops->cb_barrier != NULL) + if (cur_ops->cb_barrier != NULL) { + pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); + } + if (cur_ops->gp_slow_unregister) + cur_ops->gp_slow_unregister(NULL); return; } + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + rcutorture_test_nmis(test_nmis); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); + torture_stop_kthread(rcu_torture_preempt, preempt_task); + rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rcu_torture_fwd_prog_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); + if (nocb_tasks) { + for (i = 0; i < nrealnocbers; i++) + torture_stop_kthread(rcu_nocb_toggle, nocb_tasks[i]); + kfree(nocb_tasks); + nocb_tasks = NULL; + } + + if (updown_task) { + torture_stop_kthread(rcu_torture_updown, updown_task); + updown_task = NULL; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) torture_stop_kthread(rcu_torture_reader, reader_tasks[i]); kfree(reader_tasks); + reader_tasks = NULL; } - rcu_torture_current = NULL; + kfree(rcu_torture_reader_mbchk); + rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); - } kfree(fakewriter_tasks); fakewriter_tasks = NULL; } - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); - 
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); - pr_alert("%s: End-test grace-period state: g%lu f%#x\n", - cur_ops->name, gp_seq, flags); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); + pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", + cur_ops->name, (long)gp_seq, flags, + rcutorture_seq_diff(gp_seq, start_gp_seq)); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); - if (rcu_torture_can_boost()) + if (rcu_torture_can_boost() && rcutor_hp >= 0) cpuhp_remove_state(rcutor_hp); /* * Wait for all RCU callbacks to fire, then do torture-type-specific * cleanup operations. */ - if (cur_ops->cb_barrier != NULL) + if (cur_ops->cb_barrier != NULL) { + pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); + } if (cur_ops->cleanup != NULL) cur_ops->cleanup(); + rcu_torture_mem_dump_obj(); + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ if (err_segs_recorded) { @@ -2141,26 +4099,74 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#x ", i, err_segs[i].rt_readstate); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP)) + pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL)); + else + pr_alert("\t"); + pr_cont("%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); firsttime = 0; } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + pr_cont(" CPU %2d", err_segs[i].rt_cpu); + if (err_segs[i].rt_cpu != err_segs[i].rt_end_cpu) + pr_cont("->%-2d", err_segs[i].rt_end_cpu); + else + pr_cont(" ..."); + } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && + cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { + char buf1[20+1]; + char buf2[20+1]; + char sepchar = '-'; + + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, + buf1, ARRAY_SIZE(buf1)); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, + buf2, ARRAY_SIZE(buf2)); + if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { + if (buf2[0]) { + for (j = 0; buf2[j]; j++) + buf2[j] = '.'; + if (j) + buf2[j - 1] = ' '; + } + sepchar = ' '; + } + pr_cont(" %s%c%s", buf1, sepchar, buf2); + } if (err_segs[i].rt_delay_ms != 0) { - pr_cont("%s%ldms", firsttime ? "" : "+", + pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); firsttime = 0; } if (err_segs[i].rt_delay_us != 0) { - pr_cont("%s%ldus", firsttime ? "" : "+", + pr_cont(" %s%ldus", firsttime ? "" : "+", err_segs[i].rt_delay_us); firsttime = 0; } - pr_cont("%s\n", - err_segs[i].rt_preempted ? "preempted" : ""); + pr_cont("%s", err_segs[i].rt_preempted ? 
" preempted" : ""); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_BH) + pr_cont(" BH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_IRQ) + pr_cont(" IRQ"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_PREEMPT) + pr_cont(" PREEMPT"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RBH) + pr_cont(" RBH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_SCHED) + pr_cont(" SCHED"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_1) + pr_cont(" RCU_1"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_2) + pr_cont(" RCU_2"); + pr_cont("\n"); } + if (rt_read_preempted) + pr_alert("\tReader was preempted.\n"); } if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); @@ -2170,9 +4176,13 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); + if (cur_ops->gp_slow_unregister) + cur_ops->gp_slow_unregister(NULL); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) + rcu_gpwrap_lag_cleanup(); } -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD static void rcu_torture_leak_cb(struct rcu_head *rhp) { } @@ -2190,7 +4200,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME); } -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ /* * Verify that double-free causes debug-objects to complain, but only @@ -2199,33 +4208,233 @@ static void rcu_torture_err_cb(struct rcu_head *rhp) */ static void rcu_test_debug_objects(void) { -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD struct rcu_head rh1; struct rcu_head rh2; + int idx; + + if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) { + pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n", + KBUILD_MODNAME, cur_ops->name); + return; + } + + if (WARN_ON_ONCE(cur_ops->debug_objects && + (!cur_ops->call || !cur_ops->cb_barrier))) + return; + + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); - pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME); + pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name); /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. */ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu(&rh2, rcu_torture_leak_cb); - call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); + idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */ + cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + cur_ops->call(&rh2, rcu_torture_leak_cb); + cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + if (rhp) { + cur_ops->call(rhp, rcu_torture_leak_cb); + cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */ + } + cur_ops->readunlock(idx); /* Wait for them all to get done so we can safely return. 
*/ - rcu_barrier(); - pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); + cur_ops->cb_barrier(); + pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + kfree(rhp); +} + +static void rcutorture_sync(void) +{ + static unsigned long n; + + if (cur_ops->sync && !(++n & 0xfff)) + cur_ops->sync(); +} + +static DEFINE_MUTEX(mut0); +static DEFINE_MUTEX(mut1); +static DEFINE_MUTEX(mut2); +static DEFINE_MUTEX(mut3); +static DEFINE_MUTEX(mut4); +static DEFINE_MUTEX(mut5); +static DEFINE_MUTEX(mut6); +static DEFINE_MUTEX(mut7); +static DEFINE_MUTEX(mut8); +static DEFINE_MUTEX(mut9); + +static DECLARE_RWSEM(rwsem0); +static DECLARE_RWSEM(rwsem1); +static DECLARE_RWSEM(rwsem2); +static DECLARE_RWSEM(rwsem3); +static DECLARE_RWSEM(rwsem4); +static DECLARE_RWSEM(rwsem5); +static DECLARE_RWSEM(rwsem6); +static DECLARE_RWSEM(rwsem7); +static DECLARE_RWSEM(rwsem8); +static DECLARE_RWSEM(rwsem9); + +DEFINE_STATIC_SRCU(srcu0); +DEFINE_STATIC_SRCU(srcu1); +DEFINE_STATIC_SRCU(srcu2); +DEFINE_STATIC_SRCU(srcu3); +DEFINE_STATIC_SRCU(srcu4); +DEFINE_STATIC_SRCU(srcu5); +DEFINE_STATIC_SRCU(srcu6); +DEFINE_STATIC_SRCU(srcu7); +DEFINE_STATIC_SRCU(srcu8); +DEFINE_STATIC_SRCU(srcu9); + +static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i, + int cyclelen, int deadlock) +{ + int j = i + 1; + + if (j >= cyclelen) + j = deadlock ? 0 : -1; + if (j >= 0) + pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i); + else + pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i); + return j; +} + +// Test lockdep on SRCU-based deadlock scenarios. +static void rcu_torture_init_srcu_lockdep(void) +{ + int cyclelen; + int deadlock; + bool err = false; + int i; + int j; + int idx; + struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4, + &mut5, &mut6, &mut7, &mut8, &mut9 }; + struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4, + &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 }; + struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4, + &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 }; + int testtype; + + if (!test_srcu_lockdep) + return; + + deadlock = test_srcu_lockdep / 1000; + testtype = (test_srcu_lockdep / 10) % 100; + cyclelen = test_srcu_lockdep % 10; + WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus)); + if (WARN_ONCE(deadlock != !!deadlock, + "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n", + __func__, test_srcu_lockdep, deadlock)) + err = true; + if (WARN_ONCE(cyclelen <= 0, + "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n", + __func__, test_srcu_lockdep, cyclelen)) + err = true; + if (err) + goto err_out; + + if (testtype == 0) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? 
"" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu", + "srcu_read_unlock", i, cyclelen, deadlock); + idx = srcu_read_lock(srcus[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + srcu_read_unlock(srcus[i], idx); + } + return; + } + + if (testtype == 1) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + mutex_lock(muts[i]); + mutex_unlock(muts[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu", + "mutex_unlock", i, cyclelen, deadlock); + mutex_lock(muts[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + mutex_unlock(muts[i]); + } + return; + } + + if (testtype == 2) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + for (i = 0; i < cyclelen; i++) { + pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n", + __func__, i, i, i, i); + idx = srcu_read_lock(srcus[i]); + down_read(rwsems[i]); + up_read(rwsems[i]); + srcu_read_unlock(srcus[i], idx); + + j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu", + "up_write", i, cyclelen, deadlock); + down_write(rwsems[i]); + if (j >= 0) + synchronize_srcu(srcus[j]); + up_write(rwsems[i]); + } + return; + } + +#ifdef CONFIG_TASKS_TRACE_RCU + if (testtype == 3) { + pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n", + __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-"); + if (deadlock && cyclelen == 1) + pr_info("%s: Expect hang.\n", __func__); + for (i = 0; i < cyclelen; i++) { + char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock"; + char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace" + : "synchronize_srcu"; + char *fu = i == 0 ? 
"rcu_read_unlock_trace" : "srcu_read_unlock"; + + j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock); + if (i == 0) + rcu_read_lock_trace(); + else + idx = srcu_read_lock(srcus[i]); + if (j >= 0) { + if (i == cyclelen - 1) + synchronize_rcu_tasks_trace(); + else + synchronize_srcu(srcus[j]); + } + if (i == 0) + rcu_read_unlock_trace(); + else + srcu_read_unlock(srcus[i], idx); + } + return; + } +#endif // #ifdef CONFIG_TASKS_TRACE_RCU + +err_out: + pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep); + pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__); + pr_info("%s: D: Deadlock if nonzero.\n", __func__); + pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__); + pr_info("%s: L: Cycle length.\n", __func__); + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU)) + pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__); } static int __init @@ -2234,9 +4443,12 @@ rcu_torture_init(void) long i; int cpu; int firsterr = 0; + int flags = 0; + unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, + &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, + TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS + &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -2255,17 +4467,33 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_cont(" %s", torture_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } + if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops || + !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n", + cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU)); + nocbs_nthreads = 0; + } if (cur_ops->init) cur_ops->init(); + rcu_torture_init_srcu_lockdep(); + + if (nfakewriters >= 0) { + nrealfakewriters = nfakewriters; + } else { + nrealfakewriters = num_online_cpus() - 2 - nfakewriters; + if (nrealfakewriters <= 0) + nrealfakewriters = 1; + } + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -2274,6 +4502,11 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); + start_gp_seq = gp_seq; + pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", + cur_ops->name, (long)gp_seq, flags); /* Set up the freelist. */ @@ -2292,10 +4525,11 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_alloc_fail, 0); atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_mbchk_fail, 0); + atomic_set(&n_rcu_torture_mbchk_tries, 0); atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; n_rcu_torture_boost_failure = 0; n_rcu_torture_boosts = 0; for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) @@ -2311,64 +4545,98 @@ rcu_torture_init(void) /* Start up the kthreads. 
*/ - firsterr = torture_create_kthread(rcu_torture_writer, NULL, - writer_task); - if (firsterr) - goto unwind; - if (nfakewriters > 0) { - fakewriter_tasks = kcalloc(nfakewriters, + rcu_torture_write_types(); + if (nrealfakewriters > 0) { + fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } } - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk), + GFP_KERNEL); + if (!reader_tasks || !rcu_torture_reader_mbchk) { + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { + rcu_torture_reader_mbchk[i].rtc_chkrdr = -1; firsterr = torture_create_kthread(rcu_torture_reader, (void *)i, reader_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) + goto unwind; + } + + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (torture_init_error(firsterr)) + goto unwind; + + firsterr = rcu_torture_updown_init(); + if (torture_init_error(firsterr)) + goto unwind; + nrealnocbers = nocbs_nthreads; + if (WARN_ON(nrealnocbers < 0)) + nrealnocbers = 1; + if (WARN_ON(nocbs_toggle < 0)) + nocbs_toggle = HZ; + if (nrealnocbers > 0) { + nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL); + if (nocb_tasks == NULL) { + TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + } else { + nocb_tasks = NULL; + } + for (i = 0; i < nrealnocbers; i++) { + firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]); + if (torture_init_error(firsterr)) goto unwind; } if (stat_interval > 0) { firsterr = torture_create_kthread(rcu_torture_stats, NULL, stats_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (test_no_idle_hz && shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval * HZ); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stutter < 0) stutter = 0; if (stutter) { - firsterr = torture_stutter_init(stutter * HZ); - if (firsterr) + int t; + + t = cur_ops->stall_dur ? 
cur_ops->stall_dur() : stutter * HZ; + firsterr = torture_stutter_init(stutter * HZ, t); + if (torture_init_error(firsterr)) goto unwind; } if (fqs_duration < 0) fqs_duration = 0; - if (fqs_duration) { + if (fqs_holdoff < 0) + fqs_holdoff = 0; + if (fqs_duration && fqs_holdoff) { /* Create the fqs thread */ firsterr = torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (test_boost_interval < 1) @@ -2382,33 +4650,57 @@ rcu_torture_init(void) firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", rcutorture_booster_init, rcutorture_booster_cleanup); - if (firsterr < 0) - goto unwind; rcutor_hp = firsterr; + if (torture_init_error(firsterr)) + goto unwind; } + shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; - firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); - if (firsterr) + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, + rcutorture_sync); + if (torture_init_error(firsterr)) goto unwind; firsterr = rcu_torture_stall_init(); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; firsterr = rcu_torture_fwd_prog_init(); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; firsterr = rcu_torture_barrier_init(); - if (firsterr) + if (torture_init_error(firsterr)) + goto unwind; + firsterr = rcu_torture_read_exit_init(); + if (torture_init_error(firsterr)) goto unwind; + if (preempt_duration > 0) { + firsterr = torture_create_kthread(rcu_torture_preempt, NULL, preempt_task); + if (torture_init_error(firsterr)) + goto unwind; + } if (object_debug) rcu_test_debug_objects(); + + if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister)) + cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay); + + if (gpwrap_lag && cur_ops->set_gpwrap_lag) { + firsterr = rcu_gpwrap_lag_init(); + if (torture_init_error(firsterr)) + goto unwind; + } + torture_init_end(); return 0; unwind: torture_init_end(); rcu_torture_cleanup(); + if (shutdown_secs) { + WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); + kernel_power_off(); + } return firsterr; } diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c new file mode 100644 index 000000000000..07a313782dfd --- /dev/null +++ b/kernel/rcu/refscale.c @@ -0,0 +1,1616 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Scalability test comparing RCU vs other mechanisms +// for acquiring references on objects. +// +// Copyright (C) Google, 2020. +// +// Author: Joel Fernandes <joel@joelfernandes.org> + +#define pr_fmt(fmt) fmt + +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/kthread.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/notifier.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rcupdate_trace.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/seq_buf.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/stat.h> +#include <linux/srcu.h> +#include <linux/slab.h> +#include <linux/torture.h> +#include <linux/types.h> +#include <linux/sched/clock.h> + +#include "rcu.h" + +#define SCALE_FLAG "-ref-scale: " + +#define SCALEOUT(s, x...) 
\ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x) + +#define VERBOSE_SCALEOUT(s, x...) \ + do { \ + if (verbose) \ + pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \ + } while (0) + +static atomic_t verbose_batch_ctr; + +#define VERBOSE_SCALEOUT_BATCH(s, x...) \ +do { \ + if (verbose && \ + (verbose_batched <= 0 || \ + !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \ + schedule_timeout_uninterruptible(1); \ + pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \ + } \ +} while (0) + +#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x) + +MODULE_DESCRIPTION("Scalability test for object reference mechanisms"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>"); + +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); + +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); + +// Number of seconds to extend warm-up and cool-down for multiple guest OSes +torture_param(long, guest_os_delay, 0, + "Number of seconds to extend warm-up/cool-down for multiple guest OSes."); +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +// Number of typesafe_lookup structures, that is, the degree of concurrency. +torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures."); +// Number of loops per experiment, all readers execute operations concurrently. +torture_param(int, loops, 10000, "Number of loops per experiment."); +// Number of readers, with -1 defaulting to about 75% of the CPUs. +torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); +// Number of runs. +torture_param(int, nruns, 30, "Number of experiments to run."); +// Reader delay in nanoseconds, 0 for no delay. +torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); + +#ifdef MODULE +# define REFSCALE_SHUTDOWN 0 +#else +# define REFSCALE_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, REFSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); + +struct reader_task { + struct task_struct *task; + int start_reader; + wait_queue_head_t wq; + u64 last_duration_ns; +}; + +static struct task_struct *shutdown_task; +static wait_queue_head_t shutdown_wq; + +static struct task_struct *main_task; +static wait_queue_head_t main_wq; +static int shutdown_start; + +static struct reader_task *reader_tasks; + +// Number of readers that are part of the current experiment. +static atomic_t nreaders_exp; + +// Use to wait for all threads to start. +static atomic_t n_init; +static atomic_t n_started; +static atomic_t n_warmedup; +static atomic_t n_cooleddown; + +// Track which experiment is currently running. +static int exp_idx; + +// Operations vector for selecting different types of tests. 
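+//
+// Each mechanism under test supplies a readsection() that acquires and
+// releases its reader-side protection nloops+1 times in a tight loop,
+// plus a delaysection() that inserts a udelay()/ndelay() pause inside
+// each critical section.  Adding a mechanism is therefore a matter of
+// writing those two loops and an ops structure.  A minimal sketch with
+// hypothetical names that are not part of this patch:
+//
+//	static void ref_preempt_section(const int nloops)
+//	{
+//		int i;
+//
+//		for (i = nloops; i >= 0; i--) {
+//			preempt_disable();
+//			preempt_enable();
+//		}
+//	}
+//
+//	static void ref_preempt_delay_section(const int nloops, const int udl, const int ndl)
+//	{
+//		int i;
+//
+//		for (i = nloops; i >= 0; i--) {
+//			preempt_disable();
+//			un_delay(udl, ndl);
+//			preempt_enable();
+//		}
+//	}
+//
+//	static const struct ref_scale_ops preempt_ops = {
+//		.readsection = ref_preempt_section,
+//		.delaysection = ref_preempt_delay_section,
+//		.name = "preempt"
+//	};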
+struct ref_scale_ops { + bool (*init)(void); + void (*cleanup)(void); + void (*readsection)(const int nloops); + void (*delaysection)(const int nloops, const int udl, const int ndl); + bool enable_irqs; + const char *name; +}; + +static const struct ref_scale_ops *cur_ops; + +static void un_delay(const int udl, const int ndl) +{ + if (udl) + udelay(udl); + if (ndl) + ndelay(ndl); +} + +static void ref_rcu_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock(); + un_delay(udl, ndl); + rcu_read_unlock(); + } +} + +static bool rcu_sync_scale_init(void) +{ + return true; +} + +static const struct ref_scale_ops rcu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_rcu_read_section, + .delaysection = ref_rcu_delay_section, + .name = "rcu" +}; + +// Definitions for SRCU ref scale testing. +DEFINE_STATIC_SRCU(srcu_refctl_scale); +DEFINE_STATIC_SRCU_FAST(srcu_fast_refctl_scale); +DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_fast_updown_refctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale; + +static void srcu_ref_scale_read_section(const int nloops) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + int idx; + + for (i = nloops; i >= 0; i--) { + idx = srcu_read_lock(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock(srcu_ctlp, idx); + } +} + +static const struct ref_scale_ops srcu_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_ref_scale_read_section, + .delaysection = srcu_ref_scale_delay_section, + .name = "srcu" +}; + +static bool srcu_fast_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_refctl_scale; + return true; +} + +static void srcu_fast_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_ops = { + .init = srcu_fast_sync_scale_init, + .readsection = srcu_fast_ref_scale_read_section, + .delaysection = srcu_fast_ref_scale_delay_section, + .name = "srcu-fast" +}; + +static bool srcu_fast_updown_sync_scale_init(void) +{ + srcu_ctlp = &srcu_fast_updown_refctl_scale; + return true; +} + +static void srcu_fast_updown_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); + } +} + +static void srcu_fast_updown_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast_updown(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast_updown(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_updown_ops = { + .init = srcu_fast_updown_sync_scale_init, + .readsection = 
srcu_fast_updown_ref_scale_read_section, + .delaysection = srcu_fast_updown_ref_scale_delay_section, + .name = "srcu-fast-updown" +}; + +#ifdef CONFIG_TASKS_RCU + +// Definitions for RCU Tasks ref scale testing: Empty read markers. +// These definitions also work for RCU Rude readers. +static void rcu_tasks_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) + continue; +} + +static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) + un_delay(udl, ndl); +} + +static const struct ref_scale_ops rcu_tasks_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_tasks_ref_scale_read_section, + .delaysection = rcu_tasks_ref_scale_delay_section, + .name = "rcu-tasks" +}; + +#define RCU_TASKS_OPS &rcu_tasks_ops, + +#else // #ifdef CONFIG_TASKS_RCU + +#define RCU_TASKS_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_TRACE_RCU + +// Definitions for RCU Tasks Trace ref scale testing. +static void rcu_trace_ref_scale_read_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + rcu_read_unlock_trace(); + } +} + +static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + rcu_read_lock_trace(); + un_delay(udl, ndl); + rcu_read_unlock_trace(); + } +} + +static const struct ref_scale_ops rcu_trace_ops = { + .init = rcu_sync_scale_init, + .readsection = rcu_trace_ref_scale_read_section, + .delaysection = rcu_trace_ref_scale_delay_section, + .name = "rcu-trace" +}; + +#define RCU_TRACE_OPS &rcu_trace_ops, + +#else // #ifdef CONFIG_TASKS_TRACE_RCU + +#define RCU_TRACE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_TRACE_RCU + +// Definitions for reference count +static atomic_t refcnt; + +// Definitions acquire-release. +static DEFINE_PER_CPU(unsigned long, test_acqrel); + +static void ref_refcnt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + atomic_dec(&refcnt); + } +} + +static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + atomic_inc(&refcnt); + un_delay(udl, ndl); + atomic_dec(&refcnt); + } +} + +static const struct ref_scale_ops refcnt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_refcnt_section, + .delaysection = ref_refcnt_delay_section, + .name = "refcnt" +}; + +static void ref_percpuinc_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + this_cpu_dec(test_acqrel); + } +} + +static void ref_percpuinc_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + this_cpu_inc(test_acqrel); + un_delay(udl, ndl); + this_cpu_dec(test_acqrel); + } +} + +static const struct ref_scale_ops percpuinc_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_percpuinc_section, + .delaysection = ref_percpuinc_delay_section, + .name = "percpuinc" +}; + +// Note that this can lose counts in preemptible kernels. 
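+//
+// Why counts can be lost: this_cpu_ptr() is resolved once, after which
+// the READ_ONCE()/WRITE_ONCE() pair forms a non-atomic read-modify-write.
+// If the task is preempted (and possibly migrated) between the load and
+// the store, another task can update the same per-CPU counter, and the
+// delayed store then overwrites that update: both observe the value n,
+// both store n + 1, and one increment vanishes.  Contrast with
+// percpuinc_ops above, whose this_cpu_inc()/this_cpu_dec() operations
+// are guaranteed preemption-safe.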
+static void ref_incpercpu_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static void ref_incpercpu_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap = this_cpu_ptr(&test_acqrel); + + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + } +} + +static const struct ref_scale_ops incpercpu_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpu_section, + .delaysection = ref_incpercpu_delay_section, + .name = "incpercpu" +}; + +static void ref_incpercpupreempt_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static void ref_incpercpupreempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + preempt_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + preempt_enable(); + } +} + +static const struct ref_scale_ops incpercpupreempt_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpupreempt_section, + .delaysection = ref_incpercpupreempt_delay_section, + .name = "incpercpupreempt" +}; + +static void ref_incpercpubh_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static void ref_incpercpubh_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_bh_disable(); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_bh_enable(); + } +} + +static const struct ref_scale_ops incpercpubh_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpubh_section, + .delaysection = ref_incpercpubh_delay_section, + .enable_irqs = true, + .name = "incpercpubh" +}; + +static void ref_incpercpuirqsave_section(const int nloops) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static void ref_incpercpuirqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + unsigned long flags; + + for (i = nloops; i >= 0; i--) { + unsigned long *tap; + + local_irq_save(flags); + tap = this_cpu_ptr(&test_acqrel); + WRITE_ONCE(*tap, READ_ONCE(*tap) + 1); + un_delay(udl, ndl); + WRITE_ONCE(*tap, READ_ONCE(*tap) - 1); + local_irq_restore(flags); + } +} + +static const struct ref_scale_ops incpercpuirqsave_ops = { + .init = rcu_sync_scale_init, + .readsection = ref_incpercpuirqsave_section, + .delaysection = ref_incpercpuirqsave_delay_section, + .name = "incpercpuirqsave" +}; + +// Definitions for rwlock +static rwlock_t test_rwlock; + +static bool 
ref_rwlock_init(void) +{ + rwlock_init(&test_rwlock); + return true; +} + +static void ref_rwlock_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + read_unlock(&test_rwlock); + } +} + +static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + read_lock(&test_rwlock); + un_delay(udl, ndl); + read_unlock(&test_rwlock); + } +} + +static const struct ref_scale_ops rwlock_ops = { + .init = ref_rwlock_init, + .readsection = ref_rwlock_section, + .delaysection = ref_rwlock_delay_section, + .name = "rwlock" +}; + +// Definitions for rwsem +static struct rw_semaphore test_rwsem; + +static bool ref_rwsem_init(void) +{ + init_rwsem(&test_rwsem); + return true; +} + +static void ref_rwsem_section(const int nloops) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + up_read(&test_rwsem); + } +} + +static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + for (i = nloops; i >= 0; i--) { + down_read(&test_rwsem); + un_delay(udl, ndl); + up_read(&test_rwsem); + } +} + +static const struct ref_scale_ops rwsem_ops = { + .init = ref_rwsem_init, + .readsection = ref_rwsem_section, + .delaysection = ref_rwsem_delay_section, + .name = "rwsem" +}; + +// Definitions for global spinlock +static DEFINE_RAW_SPINLOCK(test_lock); + +static void ref_lock_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + raw_spin_lock(&test_lock); + raw_spin_unlock(&test_lock); + } + preempt_enable(); +} + +static void ref_lock_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + raw_spin_lock(&test_lock); + un_delay(udl, ndl); + raw_spin_unlock(&test_lock); + } + preempt_enable(); +} + +static const struct ref_scale_ops lock_ops = { + .readsection = ref_lock_section, + .delaysection = ref_lock_delay_section, + .name = "lock" +}; + +// Definitions for global irq-save spinlock + +static void ref_lock_irq_section(const int nloops) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + raw_spin_lock_irqsave(&test_lock, flags); + raw_spin_unlock_irqrestore(&test_lock, flags); + } + preempt_enable(); +} + +static void ref_lock_irq_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + raw_spin_lock_irqsave(&test_lock, flags); + un_delay(udl, ndl); + raw_spin_unlock_irqrestore(&test_lock, flags); + } + preempt_enable(); +} + +static const struct ref_scale_ops lock_irq_ops = { + .readsection = ref_lock_irq_section, + .delaysection = ref_lock_irq_delay_section, + .name = "lock-irq" +}; + +static void ref_acqrel_section(const int nloops) +{ + unsigned long x; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x = smp_load_acquire(this_cpu_ptr(&test_acqrel)); + smp_store_release(this_cpu_ptr(&test_acqrel), x + 1); + } + preempt_enable(); +} + +static void ref_acqrel_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned long x; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x = smp_load_acquire(this_cpu_ptr(&test_acqrel)); + un_delay(udl, ndl); + smp_store_release(this_cpu_ptr(&test_acqrel), x + 1); + } + preempt_enable(); +} + +static const struct ref_scale_ops acqrel_ops = { + .readsection = ref_acqrel_section, + .delaysection = 
ref_acqrel_delay_section, + .name = "acqrel" +}; + +static volatile u64 stopopts; + +static void ref_sched_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += sched_clock(); + preempt_enable(); + stopopts = x; +} + +static void ref_sched_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += sched_clock(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static const struct ref_scale_ops sched_clock_ops = { + .readsection = ref_sched_clock_section, + .delaysection = ref_sched_clock_delay_section, + .name = "sched-clock" +}; + + +static void ref_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += ktime_get_real_fast_ns(); + preempt_enable(); + stopopts = x; +} + +static void ref_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += ktime_get_real_fast_ns(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static const struct ref_scale_ops clock_ops = { + .readsection = ref_clock_section, + .delaysection = ref_clock_delay_section, + .name = "clock" +}; + +static void ref_jiffies_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += jiffies; + preempt_enable(); + stopopts = x; +} + +static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += jiffies; + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static const struct ref_scale_ops jiffies_ops = { + .readsection = ref_jiffies_section, + .delaysection = ref_jiffies_delay_section, + .name = "jiffies" +}; + +static void ref_preempt_section(const int nloops) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + preempt_enable(); + } + migrate_enable(); +} + +static void ref_preempt_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + migrate_disable(); + for (i = nloops; i >= 0; i--) { + preempt_disable(); + un_delay(udl, ndl); + preempt_enable(); + } + migrate_enable(); +} + +static const struct ref_scale_ops preempt_ops = { + .readsection = ref_preempt_section, + .delaysection = ref_preempt_delay_section, + .name = "preempt" +}; + +static void ref_bh_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + local_bh_enable(); + } + preempt_enable(); +} + +static void ref_bh_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_bh_disable(); + un_delay(udl, ndl); + local_bh_enable(); + } + preempt_enable(); +} + +static const struct ref_scale_ops bh_ops = { + .readsection = ref_bh_section, + .delaysection = ref_bh_delay_section, + .enable_irqs = true, + .name = "bh" +}; + +static void ref_irq_section(const int nloops) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + local_irq_enable(); + } + preempt_enable(); +} + +static void ref_irq_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_disable(); + un_delay(udl, ndl); + local_irq_enable(); + } + 
preempt_enable(); +} + +static const struct ref_scale_ops irq_ops = { + .readsection = ref_irq_section, + .delaysection = ref_irq_delay_section, + .name = "irq" +}; + +static void ref_irqsave_section(const int nloops) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + local_irq_restore(flags); + } + preempt_enable(); +} + +static void ref_irqsave_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned long flags; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + local_irq_save(flags); + un_delay(udl, ndl); + local_irq_restore(flags); + } + preempt_enable(); +} + +static const struct ref_scale_ops irqsave_ops = { + .readsection = ref_irqsave_section, + .delaysection = ref_irqsave_delay_section, + .name = "irqsave" +}; + +//////////////////////////////////////////////////////////////////////// +// +// Methods leveraging SLAB_TYPESAFE_BY_RCU. +// + +// Item to look up in a typesafe manner. Array of pointers to these. +struct refscale_typesafe { + atomic_t rts_refctr; // Used by all flavors + spinlock_t rts_lock; + seqlock_t rts_seqlock; + unsigned int a; + unsigned int b; +}; + +static struct kmem_cache *typesafe_kmem_cachep; +static struct refscale_typesafe **rtsarray; +static long rtsarray_size; +static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand); +static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start); +static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start); + +// Conditionally acquire an explicit in-structure reference count. +static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ + return atomic_inc_not_zero(&rtsp->rts_refctr); +} + +// Unconditionally release an explicit in-structure reference count. +static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start) +{ + if (!atomic_dec_return(&rtsp->rts_refctr)) { + WRITE_ONCE(rtsp->a, rtsp->a + 1); + kmem_cache_free(typesafe_kmem_cachep, rtsp); + } + return true; +} + +// Unconditionally acquire an explicit in-structure spinlock. +static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ + spin_lock(&rtsp->rts_lock); + return true; +} + +// Unconditionally release an explicit in-structure spinlock. +static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start) +{ + spin_unlock(&rtsp->rts_lock); + return true; +} + +// Unconditionally acquire an explicit in-structure sequence lock. +static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ + *start = read_seqbegin(&rtsp->rts_seqlock); + return true; +} + +// Conditionally release an explicit in-structure sequence lock. Return +// true if this release was successful, that is, if no retry is required. +static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start) +{ + return !read_seqretry(&rtsp->rts_seqlock, start); +} + +// Do a read-side critical section with the specified delay in +// microseconds and nanoseconds inserted so as to increase probability +// of failure. 
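+//
+// Roughly, each pass follows the usual SLAB_TYPESAFE_BY_RCU lookup
+// pattern (sketch only: try_acquire(), identity_check(), and release()
+// stand in for the rts_acquire() call, the re-read of ->a, and the
+// rts_release() call in the code below):
+//
+//	rcu_read_lock();
+//	p = rcu_dereference(rtsarray[idx]);
+//	if (!try_acquire(p)) {		// Object already freed?
+//		rcu_read_unlock();
+//		goto retry;
+//	}
+//	if (!identity_check(p)) {	// Reallocated with a new identity?
+//		release(p);
+//		rcu_read_unlock();
+//		goto retry;
+//	}
+//	/* ... use p, then release it and do rcu_read_unlock(). */
+//
+// The ->a field serves as the identity check: typesafe_alloc_one()
+// advances it at allocation time and typesafe_ref_release() advances
+// it again on the final free, so a reader that races with a
+// free-and-reallocate cycle observes ->a changing and retries.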
+static void typesafe_delay_section(const int nloops, const int udl, const int ndl) +{ + unsigned int a; + unsigned int b; + int i; + long idx; + struct refscale_typesafe *rtsp; + unsigned int start; + + for (i = nloops; i >= 0; i--) { + preempt_disable(); + idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size; + preempt_enable(); +retry: + rcu_read_lock(); + rtsp = rcu_dereference(rtsarray[idx]); + a = READ_ONCE(rtsp->a); + if (!rts_acquire(rtsp, &start)) { + rcu_read_unlock(); + goto retry; + } + if (a != READ_ONCE(rtsp->a)) { + (void)rts_release(rtsp, start); + rcu_read_unlock(); + goto retry; + } + un_delay(udl, ndl); + b = READ_ONCE(rtsp->a); + // Remember, seqlock read-side release can fail. + if (!rts_release(rtsp, start)) { + rcu_read_unlock(); + goto retry; + } + WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b); + b = rtsp->b; + rcu_read_unlock(); + WARN_ON_ONCE(a * a != b); + } +} + +// Because the acquisition and release methods are expensive, there +// is no point in optimizing away the un_delay() function's two checks. +// Thus simply define typesafe_read_section() as a simple wrapper around +// typesafe_delay_section(). +static void typesafe_read_section(const int nloops) +{ + typesafe_delay_section(nloops, 0, 0); +} + +// Allocate and initialize one refscale_typesafe structure. +static struct refscale_typesafe *typesafe_alloc_one(void) +{ + struct refscale_typesafe *rtsp; + + rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL); + if (!rtsp) + return NULL; + atomic_set(&rtsp->rts_refctr, 1); + WRITE_ONCE(rtsp->a, rtsp->a + 1); + WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a); + return rtsp; +} + +// Slab-allocator constructor for refscale_typesafe structures created +// out of a new slab of system memory. +static void refscale_typesafe_ctor(void *rtsp_in) +{ + struct refscale_typesafe *rtsp = rtsp_in; + + spin_lock_init(&rtsp->rts_lock); + seqlock_init(&rtsp->rts_seqlock); + preempt_disable(); + rtsp->a = torture_random(this_cpu_ptr(&refscale_rand)); + preempt_enable(); +} + +static const struct ref_scale_ops typesafe_ref_ops; +static const struct ref_scale_ops typesafe_lock_ops; +static const struct ref_scale_ops typesafe_seqlock_ops; + +// Initialize for a typesafe test. +static bool typesafe_init(void) +{ + long idx; + long si = lookup_instances; + + typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe", + sizeof(struct refscale_typesafe), sizeof(void *), + SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor); + if (!typesafe_kmem_cachep) + return false; + if (si < 0) + si = -si * nr_cpu_ids; + else if (si == 0) + si = nr_cpu_ids; + rtsarray_size = si; + rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL); + if (!rtsarray) + return false; + for (idx = 0; idx < rtsarray_size; idx++) { + rtsarray[idx] = typesafe_alloc_one(); + if (!rtsarray[idx]) + return false; + } + if (cur_ops == &typesafe_ref_ops) { + rts_acquire = typesafe_ref_acquire; + rts_release = typesafe_ref_release; + } else if (cur_ops == &typesafe_lock_ops) { + rts_acquire = typesafe_lock_acquire; + rts_release = typesafe_lock_release; + } else if (cur_ops == &typesafe_seqlock_ops) { + rts_acquire = typesafe_seqlock_acquire; + rts_release = typesafe_seqlock_release; + } else { + WARN_ON_ONCE(1); + return false; + } + return true; +} + +// Clean up after a typesafe test. 
+static void typesafe_cleanup(void) +{ + long idx; + + if (rtsarray) { + for (idx = 0; idx < rtsarray_size; idx++) + kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]); + kfree(rtsarray); + rtsarray = NULL; + rtsarray_size = 0; + } + kmem_cache_destroy(typesafe_kmem_cachep); + typesafe_kmem_cachep = NULL; + rts_acquire = NULL; + rts_release = NULL; +} + +// The typesafe_init() function distinguishes these structures by address. +static const struct ref_scale_ops typesafe_ref_ops = { + .init = typesafe_init, + .cleanup = typesafe_cleanup, + .readsection = typesafe_read_section, + .delaysection = typesafe_delay_section, + .name = "typesafe_ref" +}; + +static const struct ref_scale_ops typesafe_lock_ops = { + .init = typesafe_init, + .cleanup = typesafe_cleanup, + .readsection = typesafe_read_section, + .delaysection = typesafe_delay_section, + .name = "typesafe_lock" +}; + +static const struct ref_scale_ops typesafe_seqlock_ops = { + .init = typesafe_init, + .cleanup = typesafe_cleanup, + .readsection = typesafe_read_section, + .delaysection = typesafe_delay_section, + .name = "typesafe_seqlock" +}; + +static void rcu_scale_one_reader(void) +{ + if (readdelay <= 0) + cur_ops->readsection(loops); + else + cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000); +} + +// Warm up cache, or, if needed run a series of rcu_scale_one_reader() +// to allow multiple rcuscale guest OSes to collect mutually valid data. +static void rcu_scale_warm_cool(void) +{ + unsigned long jdone = jiffies + (guest_os_delay > 0 ? guest_os_delay * HZ : -1); + + do { + rcu_scale_one_reader(); + cond_resched(); + } while (time_before(jiffies, jdone)); +} + +// Reader kthread. Repeatedly does empty RCU read-side +// critical section, minimizing update-side interference. +static int +ref_scale_reader(void *arg) +{ + unsigned long flags; + long me = (long)arg; + struct reader_task *rt = &(reader_tasks[me]); + u64 start; + s64 duration; + + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me); + WARN_ON_ONCE(set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids))); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_init); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); +repeat: + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, raw_smp_processor_id()); + + // Wait for signal that this reader can start. + wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || + torture_must_stop()); + + if (torture_must_stop()) + goto end; + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(raw_smp_processor_id() != me % nr_cpu_ids); + + WRITE_ONCE(rt->start_reader, 0); + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) + cpu_relax(); + + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d started", me, exp_idx); + + + // To reduce noise, do an initial cache-warming invocation, check + // in, and then keep warming until everyone has checked in. + rcu_scale_one_reader(); + if (!atomic_dec_return(&n_warmedup)) + while (atomic_read_acquire(&n_warmedup)) + rcu_scale_one_reader(); + // Also keep interrupts disabled when it is safe to do so, which + // it is not for local_bh_enable(). This also has the effect of + // preventing entries into slow path for rcu_read_unlock(). 
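+	// (local_bh_enable() must be called with interrupts enabled,
+	// and the kernel warns if it is not, because it may need to
+	// run pending softirqs on the spot.  The bh-flavored scale
+	// types therefore set ->enable_irqs to opt out of this
+	// optimization.)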
+ if (!cur_ops->enable_irqs) + local_irq_save(flags); + start = ktime_get_mono_fast_ns(); + + rcu_scale_one_reader(); + + duration = ktime_get_mono_fast_ns() - start; + if (!cur_ops->enable_irqs) + local_irq_restore(flags); + + rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration; + // To reduce runtime-skew noise, do maintain-load invocations until + // everyone is done. + if (!atomic_dec_return(&n_cooleddown)) + while (atomic_read_acquire(&n_cooleddown)) + rcu_scale_one_reader(); + + if (atomic_dec_and_test(&nreaders_exp)) + wake_up(&main_wq); + + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); + + if (!torture_must_stop()) + goto repeat; +end: + torture_kthread_stopping("ref_scale_reader"); + return 0; +} + +static void reset_readers(void) +{ + int i; + struct reader_task *rt; + + for (i = 0; i < nreaders; i++) { + rt = &(reader_tasks[i]); + + rt->last_duration_ns = 0; + } +} + +// Print the results of each reader and return the sum of all their durations. +static u64 process_durations(int n) +{ + int i; + struct reader_task *rt; + struct seq_buf s; + char *buf; + u64 sum = 0; + + buf = kmalloc(800 + 64, GFP_KERNEL); + if (!buf) + return 0; + seq_buf_init(&s, buf, 800 + 64); + + seq_buf_printf(&s, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)", + exp_idx); + + for (i = 0; i < n && !torture_must_stop(); i++) { + rt = &(reader_tasks[i]); + + if (i % 5 == 0) + seq_buf_putc(&s, '\n'); + + if (seq_buf_used(&s) >= 800) { + pr_alert("%s", seq_buf_str(&s)); + seq_buf_clear(&s); + } + + seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns); + + sum += rt->last_duration_ns; + } + pr_alert("%s\n", seq_buf_str(&s)); + + kfree(buf); + return sum; +} + +// The main_func is the main orchestrator, it performs a bunch of +// experiments. For every experiment, it orders all the readers +// involved to start and waits for them to finish the experiment. It +// then reads their timestamps and starts the next experiment. Each +// experiment progresses from 1 concurrent reader to N of them at which +// point all the timestamps are printed. +static int main_func(void *arg) +{ + int exp, r; + char buf1[64]; + char *buf; + u64 *result_avg; + + set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + VERBOSE_SCALEOUT("main_func task started"); + result_avg = kcalloc(nruns, sizeof(*result_avg), GFP_KERNEL); + buf = kzalloc(800 + 64, GFP_KERNEL); + if (!result_avg || !buf) { + SCALEOUT_ERRSTRING("out of memory"); + goto oom_exit; + } + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + // Wait for all threads to start. 
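+	// (Each ref_scale_reader() kthread did atomic_inc(&n_init) when
+	// it started, and this kthread adds one for itself just below,
+	// so n_init reaching nreaders + 1 means that every participant
+	// is up and running before the first experiment begins.)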
+ atomic_inc(&n_init); + while (atomic_read(&n_init) < nreaders + 1) + schedule_timeout_uninterruptible(1); + + // Start exp readers up per experiment + rcu_scale_warm_cool(); + for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { + if (torture_must_stop()) + goto end; + + reset_readers(); + atomic_set(&nreaders_exp, nreaders); + atomic_set(&n_started, nreaders); + atomic_set(&n_warmedup, nreaders); + atomic_set(&n_cooleddown, nreaders); + + exp_idx = exp; + + for (r = 0; r < nreaders; r++) { + smp_store_release(&reader_tasks[r].start_reader, 1); + wake_up(&reader_tasks[r].wq); + } + + VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers", + nreaders); + + wait_event(main_wq, + !atomic_read(&nreaders_exp) || torture_must_stop()); + + VERBOSE_SCALEOUT("main_func: experiment ended"); + + if (torture_must_stop()) + goto end; + + result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops); + } + rcu_scale_warm_cool(); + + // Print the average of all experiments + SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); + + pr_alert("Runs\tTime(ns)\n"); + for (exp = 0; exp < nruns; exp++) { + u64 avg; + u32 rem; + + avg = div_u64_rem(result_avg[exp], 1000, &rem); + sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); + strcat(buf, buf1); + if (strlen(buf) >= 800) { + pr_alert("%s", buf); + buf[0] = 0; + } + } + + pr_alert("%s", buf); + +oom_exit: + // This will shutdown everything including us. + if (shutdown) { + shutdown_start = 1; + wake_up(&shutdown_wq); + } + + // Wait for torture to stop us + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); + +end: + torture_kthread_stopping("main_func"); + kfree(result_avg); + kfree(buf); + return 0; +} + +static void +ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) +{ + pr_alert("%s" SCALE_FLAG + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); +} + +static void +ref_scale_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nreaders; i++) + torture_stop_kthread("ref_scale_reader", + reader_tasks[i].task); + } + kfree(reader_tasks); + reader_tasks = NULL; + + torture_stop_kthread("main_task", main_task); + + // Do scale-type-specific cleanup operations. + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +// Shutdown kthread. Just waits to be awakened, then shuts down system. +static int +ref_scale_shutdown(void *arg) +{ + wait_event_idle(shutdown_wq, shutdown_start); + + smp_mb(); // Wake before output. 
+ ref_scale_cleanup(); + kernel_power_off(); + + return -EINVAL; +} + +static int __init +ref_scale_init(void) +{ + long i; + int firsterr = 0; + static const struct ref_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_fast_updown_ops, + RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &percpuinc_ops, &incpercpu_ops, &incpercpupreempt_ops, + &incpercpubh_ops, &incpercpuirqsave_ops, + &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, + &sched_clock_ops, &clock_ops, &jiffies_ops, + &preempt_ops, &bh_ops, &irq_ops, &irqsave_ops, + &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, + }; + + if (!torture_init_begin(scale_type, verbose)) + return -EBUSY; + + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); + pr_cont("\n"); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + if (!cur_ops->init()) { + firsterr = -EUCLEAN; + goto unwind; + } + + ref_scale_print_module_parms(cur_ops, "Start of test"); + + // Shutdown task + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(ref_scale_shutdown, NULL, + shutdown_task); + if (torture_init_error(firsterr)) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + // Reader tasks (default to ~75% of online CPUs). + if (nreaders < 0) + nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops)) + loops = 1; + if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders)) + nreaders = 1; + if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns)) + nruns = 1; + if (WARN_ONCE(loops > INT_MAX / nreaders, + "%s: nreaders * loops will overflow, adjusted loops to %d", + __func__, INT_MAX / nreaders)) + loops = INT_MAX / nreaders; + reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (!reader_tasks) { + SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCALEOUT("Starting %d reader threads", nreaders); + + for (i = 0; i < nreaders; i++) { + init_waitqueue_head(&reader_tasks[i].wq); + firsterr = torture_create_kthread(ref_scale_reader, (void *)i, + reader_tasks[i].task); + if (torture_init_error(firsterr)) + goto unwind; + } + + // Main Task + init_waitqueue_head(&main_wq); + firsterr = torture_create_kthread(main_func, NULL, main_task); + if (torture_init_error(firsterr)) + goto unwind; + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + ref_scale_cleanup(); + if (shutdown) { + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); + kernel_power_off(); + } + return firsterr; +} + +module_init(ref_scale_init); +module_exit(ref_scale_cleanup); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 32dfd6522548..3450c3751ef7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion, * tiny version for non-preemptible single-CPU use. 
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2017 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> */ #include <linux/export.h> @@ -33,7 +20,11 @@ #include "rcu_segcblist.h" #include "rcu.h" +#ifndef CONFIG_TREE_RCU int rcu_scheduler_active __read_mostly; +#else // #ifndef CONFIG_TREE_RCU +extern int rcu_scheduler_active; +#endif // #else // #ifndef CONFIG_TREE_RCU static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; @@ -47,6 +38,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp) ssp->srcu_gp_running = false; ssp->srcu_gp_waiting = false; ssp->srcu_idx = 0; + ssp->srcu_idx_max = 0; INIT_WORK(&ssp->srcu_work, srcu_drive_gp); INIT_LIST_HEAD(&ssp->srcu_work.entry); return 0; @@ -89,19 +81,18 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +void cleanup_srcu_struct(struct srcu_struct *ssp) { WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); - if (quiesced) - WARN_ON(work_pending(&ssp->srcu_work)); - else - flush_work(&ssp->srcu_work); + flush_work(&ssp->srcu_work); WARN_ON(ssp->srcu_gp_running); WARN_ON(ssp->srcu_gp_waiting); WARN_ON(ssp->srcu_cb_head); WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); + WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max); + WARN_ON(ssp->srcu_idx & 0x1); } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of @@ -109,18 +100,21 @@ EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); */ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { - int newval = ssp->srcu_lock_nesting[idx] - 1; + int newval; + preempt_disable(); // Needed for PREEMPT_LAZY + newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); - if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) + preempt_enable(); + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task() && !irqs_disabled()) swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPT operation - * means that we get away with murder on synchronization. ;-) + * that become ready as a result. Single-CPU operation and preemption + * disabling mean that we get away with murder on synchronization. 
;-) */ void srcu_drive_gp(struct work_struct *wp) { @@ -130,8 +124,11 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) + preempt_disable(); // Needed for PREEMPT_LAZY + if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { + preempt_enable(); return; /* Already running or nothing to do. */ + } /* Remove recently arrived callbacks and wait for readers. */ WRITE_ONCE(ssp->srcu_gp_running, true); @@ -140,16 +137,26 @@ void srcu_drive_gp(struct work_struct *wp) ssp->srcu_cb_head = NULL; ssp->srcu_cb_tail = &ssp->srcu_cb_head; local_irq_enable(); - idx = ssp->srcu_idx; - WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx); + idx = (ssp->srcu_idx & 0x2) / 2; + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ - swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + preempt_enable(); + do { + // Deadlock issues prevent __srcu_read_unlock() from + // doing an unconditional wakeup, so polling is required. + swait_event_timeout_exclusive(ssp->srcu_wq, + !READ_ONCE(ssp->srcu_lock_nesting[idx]), HZ / 10); + } while (READ_ONCE(ssp->srcu_lock_nesting[idx])); + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + preempt_enable(); /* Invoke the callbacks we removed above. */ while (lh) { rhp = lh; lh = lh->next; + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -161,12 +168,33 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_running, false); - if (READ_ONCE(ssp->srcu_cb_head)) + idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); + preempt_enable(); + if (idx) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); +static void srcu_gp_start_if_needed(struct srcu_struct *ssp) +{ + unsigned long cookie; + + lockdep_assert_preemption_disabled(); // Needed for PREEMPT_LAZY + cookie = get_state_synchronize_srcu(ssp); + if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { + return; + } + WRITE_ONCE(ssp->srcu_idx_max, cookie); + if (!READ_ONCE(ssp->srcu_gp_running)) { + if (likely(srcu_init_done)) + schedule_work(&ssp->srcu_work); + else if (list_empty(&ssp->srcu_work.entry)) + list_add(&ssp->srcu_work.entry, &srcu_boot_list); + } +} + /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. 
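 *
 * A note on Tiny SRCU's grace-period encoding: ->srcu_idx advances
 * by two per grace period, its bottom bit is set while a grace
 * period is in flight, and its 0x2 bit determines which
 * ->srcu_lock_nesting[] counter a given grace period waits on (see
 * the idx calculation in srcu_drive_gp() above).  A polling cookie
 * is thus simply a future even value of ->srcu_idx, which is why
 * get_state_synchronize_srcu() below computes it as
 * ((->srcu_idx + 3) & ~0x1) and why srcu_gp_start_if_needed() can
 * compare cookies against ->srcu_idx_max using ULONG_CMP_GE().
 * A hypothetical updater might use the polling interface as follows:
 *
 *	unsigned long cookie = start_poll_synchronize_srcu(&my_srcu);
 *	...
 *	if (poll_state_synchronize_srcu(&my_srcu, cookie))
 *		;	// Full grace period has elapsed since the start call.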
@@ -178,16 +206,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; + preempt_disable(); // Needed for PREEMPT_LAZY local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); - if (!READ_ONCE(ssp->srcu_gp_running)) { - if (likely(srcu_init_done)) - schedule_work(&ssp->srcu_work); - else if (list_empty(&ssp->srcu_work.entry)) - list_add(&ssp->srcu_work.entry, &srcu_boot_list); - } + srcu_gp_start_if_needed(ssp); + preempt_enable(); } EXPORT_SYMBOL_GPL(call_srcu); @@ -198,6 +223,18 @@ void synchronize_srcu(struct srcu_struct *ssp) { struct rcu_synchronize rs; + srcu_lock_sync(&ssp->dep_map); + + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); + + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return; + + might_sleep(); init_rcu_head_on_stack(&rs.head); init_completion(&rs.completion); call_srcu(ssp, &rs.head, wakeme_after_rcu); @@ -206,11 +243,59 @@ void synchronize_srcu(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/* + * get_state_synchronize_srcu - Provide an end-of-grace-period cookie + */ +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret; + + barrier(); + ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; + barrier(); + return ret; +} +EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); + +/* + * start_poll_synchronize_srcu - Provide cookie and start grace period + * + * The difference between this and get_state_synchronize_srcu() is that + * this function ensures that the poll_state_synchronize_srcu() will + * eventually return the value true. + */ +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret; + + preempt_disable(); // Needed for PREEMPT_LAZY + ret = get_state_synchronize_srcu(ssp); + srcu_gp_start_if_needed(ssp); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); + +/* + * poll_state_synchronize_srcu - Has cookie's grace period ended? + */ +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) +{ + unsigned long cur_s = READ_ONCE(ssp->srcu_idx); + + barrier(); + return cookie == SRCU_GET_STATE_COMPLETED || + ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3); +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); + +#ifndef CONFIG_TREE_RCU /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } +#endif // #ifndef CONFIG_TREE_RCU /* * Queue work for srcu_struct structures with early boot callbacks. diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3600d88d8956..ea3f128de06f 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Authors: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - @@ -37,6 +24,7 @@ #include <linux/smp.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/srcu.h> #include "rcu.h" @@ -51,6 +39,35 @@ module_param(exp_holdoff, ulong, 0444); static ulong counter_wrap_check = (ULONG_MAX >> 2); module_param(counter_wrap_check, ulong, 0444); +/* + * Control conversion to SRCU_SIZE_BIG: + * 0: Don't convert at all. + * 1: Convert at init_srcu_struct() time. + * 2: Convert when rcutorture invokes srcu_torture_stats_print(). + * 3: Decide at boot time based on system shape (default). + * 0x1x: Convert when excessive contention encountered. + */ +#define SRCU_SIZING_NONE 0 +#define SRCU_SIZING_INIT 1 +#define SRCU_SIZING_TORTURE 2 +#define SRCU_SIZING_AUTO 3 +#define SRCU_SIZING_CONTEND 0x10 +#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x) +#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE)) +#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) +#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) +#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) +static int convert_to_big = SRCU_SIZING_AUTO; +module_param(convert_to_big, int, 0444); + +/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */ +static int big_cpu_lim __read_mostly = 128; +module_param(big_cpu_lim, int, 0444); + +/* Contention events per jiffy to initiate transition to big. */ +static int small_contention_lim __read_mostly = 100; +module_param(small_contention_lim, int, 0444); + /* Early-boot callback-management, so early that no lock is required! */ static LIST_HEAD(srcu_boot_list); static bool __read_mostly srcu_init_done; @@ -58,41 +75,93 @@ static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); +static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). 
*/ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) #define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irq_rcu_node(p) \ +#define spin_unlock_irq_rcu_node(p) \ spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ } while (0) -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ +#define spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ /* - * Initialize SRCU combining tree. Note that statically allocated + * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and - * srcu_read_unlock() running against them. So if the is_static parameter - * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + * srcu_read_unlock() running against them. So if the is_static + * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and + * ->srcu_ctrs[].srcu_unlocks. + */ +static void init_srcu_struct_data(struct srcu_struct *ssp) +{ + int cpu; + struct srcu_data *sdp; + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head; + sdp->mynode = NULL; + sdp->cpu = cpu; + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); + sdp->ssp = ssp; + } +} + +/* Invalid seq state, used during snp node initialization */ +#define SRCU_SNP_INIT_SEQ 0x2 + +/* + * Check whether sequence number corresponding to snp node, + * is invalid. */ -static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) +static inline bool srcu_invl_snp_seq(unsigned long s) +{ + return s == SRCU_SNP_INIT_SEQ; +} + +/* + * Allocated and initialize SRCU combining tree. Returns @true if + * allocation succeeded and @false otherwise. + */ +static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) { int cpu; int i; @@ -102,35 +171,41 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) struct srcu_node *snp; struct srcu_node *snp_first; + /* Initialize geometry if it has not already been initialized. 
*/ + rcu_init_geometry(); + ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags); + if (!ssp->srcu_sup->node) + return false; + /* Work out the overall tree geometry. */ - ssp->level[0] = &ssp->node[0]; + ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0]; for (i = 1; i < rcu_num_lvls; i++) - ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1]; + ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1]; rcu_init_levelspread(levelspread, num_rcu_lvl); /* Each pass through this loop initializes one srcu_node structure. */ srcu_for_each_node_breadth_first(ssp, snp) { spin_lock_init(&ACCESS_PRIVATE(snp, lock)); - WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { - snp->srcu_have_cbs[i] = 0; + snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; snp->srcu_data_have_cbs[i] = 0; } - snp->srcu_gp_seq_needed_exp = 0; + snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; snp->grplo = -1; snp->grphi = -1; - if (snp == &ssp->node[0]) { + if (snp == &ssp->srcu_sup->node[0]) { /* Root node, special case. */ snp->srcu_parent = NULL; continue; } /* Non-root node. */ - if (snp == ssp->level[level + 1]) + if (snp == ssp->srcu_sup->level[level + 1]) level++; - snp->srcu_parent = ssp->level[level - 1] + - (snp - ssp->level[level]) / + snp->srcu_parent = ssp->srcu_sup->level[level - 1] + + (snp - ssp->srcu_sup->level[level]) / levelspread[level - 1]; } @@ -138,96 +213,248 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. */ - WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); level = rcu_num_lvls - 1; - snp_first = ssp->level[level]; + snp_first = ssp->srcu_sup->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); - rcu_segcblist_init(&sdp->srcu_cblist); - sdp->srcu_cblist_invoking = false; - sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; - sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) snp->grplo = cpu; snp->grphi = cpu; } - sdp->cpu = cpu; - INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); - sdp->ssp = ssp; - sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); - if (is_static) - continue; - - /* Dynamically allocated, better be no srcu_read_locks()! */ - for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { - sdp->srcu_lock_count[i] = 0; - sdp->srcu_unlock_count[i] = 0; - } + sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo); } + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); + return true; } /* * Initialize non-compile-time initialized fields, including the - * associated srcu_node and srcu_data structures. The is_static - * parameter is passed through to init_srcu_struct_nodes(), and - * also tells us that ->sda has already been wired up to srcu_data. + * associated srcu_node and srcu_data structures. The is_static parameter + * tells us that ->sda has already been wired up to srcu_data. 
*/ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) { - mutex_init(&ssp->srcu_cb_mutex); - mutex_init(&ssp->srcu_gp_mutex); - ssp->srcu_idx = 0; - ssp->srcu_gp_seq = 0; - ssp->srcu_barrier_seq = 0; - mutex_init(&ssp->srcu_barrier_mutex); - atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); - INIT_DELAYED_WORK(&ssp->work, process_srcu); if (!is_static) + ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL); + if (!ssp->srcu_sup) + return -ENOMEM; + if (!is_static) + spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; + ssp->srcu_sup->node = NULL; + mutex_init(&ssp->srcu_sup->srcu_cb_mutex); + mutex_init(&ssp->srcu_sup->srcu_gp_mutex); + ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; + ssp->srcu_sup->srcu_barrier_seq = 0; + mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); + ssp->srcu_sup->sda_is_static = is_static; + if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); - init_srcu_struct_nodes(ssp, is_static); - ssp->srcu_gp_seq_needed_exp = 0; - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ - return ssp->sda ? 0 : -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } + if (!ssp->sda) + goto err_free_sup; + init_srcu_struct_data(ssp); + ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL; + ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); + if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) + goto err_free_sda; + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); + } + ssp->srcu_sup->srcu_ssp = ssp; + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, + SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */ + return 0; + +err_free_sda: + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } +err_free_sup: + if (!is_static) { + kfree(ssp->srcu_sup); + ssp->srcu_sup = NULL; + } + return -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *ssp, const char *name, - struct lock_class_key *key) +static int +__init_srcu_struct_common(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) { /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); lockdep_init_map(&ssp->dep_map, name, key, 0); - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); return init_srcu_struct_fields(ssp, false); } + +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = 0; + return __init_srcu_struct_common(ssp, name, key); +} EXPORT_SYMBOL_GPL(__init_srcu_struct); +int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast); + +int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name, + struct lock_class_key *key) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return __init_srcu_struct_common(ssp, name, key); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct_fast_updown); + #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** * init_srcu_struct - initialize a sleep-RCU structure * @ssp: structure to initialize. 
* - * Must invoke this on a given srcu_struct before passing that srcu_struct + * Use this in place of DEFINE_SRCU() and DEFINE_STATIC_SRCU() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock(), srcu_read_lock_nmisafe(), and friends. It is necessary + * to invoke this on a given srcu_struct before passing that srcu_struct * to any other function. Each srcu_struct represents a separate domain * of SRCU protection. */ int init_srcu_struct(struct srcu_struct *ssp) { - spin_lock_init(&ACCESS_PRIVATE(ssp, lock)); + ssp->srcu_reader_flavor = 0; return init_srcu_struct_fields(ssp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); +/** + * init_srcu_struct_fast - initialize a fast-reader sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this in place of DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST() + * for non-static srcu_struct structures that are to be passed to + * srcu_read_lock_fast() and friends. It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. + */ +int init_srcu_struct_fast(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast); + +/** + * init_srcu_struct_fast_updown - initialize a fast-reader up/down sleep-RCU structure + * @ssp: structure to initialize. + * + * Use this function in place of DEFINE_SRCU_FAST_UPDOWN() and + * DEFINE_STATIC_SRCU_FAST_UPDOWN() for non-static srcu_struct + * structures that are to be passed to srcu_read_lock_fast_updown(), + * srcu_down_read_fast(), and friends. It is necessary to invoke this on a + * given srcu_struct before passing that srcu_struct to any other function. + * Each srcu_struct represents a separate domain of SRCU protection. + */ +int init_srcu_struct_fast_updown(struct srcu_struct *ssp) +{ + ssp->srcu_reader_flavor = SRCU_READ_FLAVOR_FAST_UPDOWN; + return init_srcu_struct_fields(ssp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct_fast_updown); + #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* + * Initiate a transition to SRCU_SIZE_BIG with lock held. + */ +static void __srcu_transition_to_big(struct srcu_struct *ssp) +{ + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC); +} + +/* + * Initiate an idempotent transition to SRCU_SIZE_BIG. + */ +static void srcu_transition_to_big(struct srcu_struct *ssp) +{ + unsigned long flags; + + /* Double-checked locking on ->srcu_size-state. */ + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) + return; + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + return; + } + __srcu_transition_to_big(ssp); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); +} + +/* + * Check to see if the just-encountered contention event justifies + * a transition to SRCU_SIZE_BIG. 
+ */ +static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) +{ + unsigned long j; + + if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state) + return; + j = jiffies; + if (ssp->srcu_sup->srcu_size_jiffies != j) { + ssp->srcu_sup->srcu_size_jiffies = j; + ssp->srcu_sup->srcu_n_lock_retries = 0; + } + if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim) + return; + __srcu_transition_to_big(ssp); +} + +/* + * Acquire the specified srcu_data structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +{ + struct srcu_struct *ssp = sdp->ssp; + + if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); + spin_lock_irqsave_check_contention(ssp); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); + spin_lock_irqsave_rcu_node(sdp, *flags); +} + +/* + * Acquire the specified srcu_struct structure's ->lock, but check for + * excessive contention, which results in initiation of a transition + * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module + * parameter permits this. + */ +static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +{ + if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) + return; + spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); + spin_lock_irqsave_check_contention(ssp); +} + +/* * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added @@ -240,48 +467,72 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) unsigned long flags; /* The smp_load_acquire() pairs with the smp_store_release(). */ - if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(ssp, flags); - if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) { + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } init_srcu_struct_fields(ssp, true); - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* - * Returns approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx. + * Is the current or any upcoming grace period to be expedited? */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) +static bool srcu_gp_is_expedited(struct srcu_struct *ssp) +{ + struct srcu_usage *sup = ssp->srcu_sup; + + return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)); +} + +/* + * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks + * values for the rank of per-CPU counters specified by idx, and returns + * true if the caller did the proper barrier (gp), and if the count of + * the locks matches that of the unlocks passed in. 
+ */ +static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { int cpu; + unsigned long mask = 0; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += READ_ONCE(cpuc->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } - return sum; + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp) + return false; + return sum == unlocks; } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks + * values for the rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { int cpu; + unsigned long mask = ssp->srcu_reader_flavor; unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks); + mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), + "Mixed reader flavors for srcu_struct at %ps.\n", ssp); + *rdm = mask; return sum; } @@ -291,45 +542,89 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) */ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) { + bool did_gp; + unsigned long rdm; unsigned long unlocks; - unlocks = srcu_readers_unlock_idx(ssp, idx); + unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); + did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP); /* * Make sure that a lock is always counted if the corresponding * unlock is counted. Needs to be a smp_mb() as the read side may * contain a read from a variable that is written to before the * synchronize_srcu() in the write side. In this case smp_mb()s - * A and B act like the store buffering pattern. + * A and B (or X and Y) act like the store buffering pattern. * - * This smp_mb() also pairs with smp_mb() C to prevent accesses - * after the synchronize_srcu() from being executed before the - * grace period ends. + * This smp_mb() also pairs with smp_mb() C (or, in the case of X, + * Z) to prevent accesses after the synchronize_srcu() from being + * executed before the grace period ends. */ - smp_mb(); /* A */ + if (!did_gp) + smp_mb(); /* A */ + else if (srcu_gp_is_expedited(ssp)) + synchronize_rcu_expedited(); /* X */ + else + synchronize_rcu(); /* X */ /* * If the locks are the same as the unlocks, then there must have - * been no readers on this index at some time in between. This does - * not mean that there are no more readers, as one could have read - * the current index but not have incremented the lock counter yet. + * been no readers on this index at some point in this function. + * But there might be more readers, as a task might have read + * the current ->srcu_ctrp but not yet have incremented its CPU's + * ->srcu_ctrs[idx].srcu_locks counter. 
In fact, it is possible
+	 * that most of the tasks have been preempted between fetching
+	 * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And
+	 * there could be almost (ULONG_MAX / sizeof(struct task_struct))
+	 * tasks in a system whose address space was fully populated
+	 * with memory. Call this quantity Nt.
 	 *
-	 * So suppose that the updater is preempted here for so long
-	 * that more than ULONG_MAX non-nested readers come and go in
-	 * the meantime. It turns out that this cannot result in overflow
-	 * because if a reader modifies its unlock count after we read it
-	 * above, then that reader's next load of ->srcu_idx is guaranteed
-	 * to get the new value, which will cause it to operate on the
-	 * other bank of counters, where it cannot contribute to the
-	 * overflow of these counters. This means that there is a maximum
-	 * of 2*NR_CPUS increments, which cannot overflow given current
-	 * systems, especially not on 64-bit systems.
+	 * So suppose that the updater is preempted at this
+	 * point in the code for a long time. That now-preempted
+	 * updater has already flipped ->srcu_ctrp (possibly during
+	 * the preceding grace period), done an smp_mb() (again,
+	 * possibly during the preceding grace period), and summed up
+	 * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times
+	 * can a given one of the aforementioned Nt tasks increment the
+	 * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter,
+	 * in the absence of nesting?
 	 *
-	 * OK, how about nesting? This does impose a limit on nesting
-	 * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
-	 * especially on 64-bit systems.
+	 * It can clearly do so once, given that it has already fetched
+	 * the old value of ->srcu_ctrp and is just about to use that
+	 * value to index its increment of ->srcu_ctrs[idx].srcu_locks.
+	 * But as soon as it leaves that SRCU read-side critical section,
+	 * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must
+	 * follow the updater's above read from that same value. Thus,
+	 * as soon as the reading task does an smp_mb() and a later fetch from
+	 * ->srcu_ctrp, that task will be guaranteed to get the new index.
+	 * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks
+	 * in __srcu_read_unlock() is after the smp_mb(), and the fetch
+	 * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb().
+	 * Thus, that task might not see the new value of ->srcu_ctrp until
+	 * the -second- __srcu_read_lock(), which in turn means that this
+	 * task might well increment ->srcu_ctrs[idx].srcu_locks for the
+	 * old value of ->srcu_ctrp twice, not just once.
+	 *
+	 * However, it is important to note that a given smp_mb() takes
+	 * effect not just for the task executing it, but also for any
+	 * later task running on that same CPU.
+	 *
+	 * That is, there can be almost Nt + Nc further increments
+	 * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc
+	 * is the number of CPUs. But this is OK because the size of
+	 * the task_struct structure limits the value of Nt and current
+	 * systems limit Nc to a few thousand.
+	 *
+	 * OK, but what about nesting? This does impose a limit on
+	 * nesting of half of the size of the task_struct structure
+	 * (measured in bytes), which should be sufficient. A late 2022
+	 * TREE01 rcutorture run reported this size to be no less than
+	 * 9408 bytes, allowing up to 4704 levels of nesting, which is
+	 * comfortably beyond excessive.
Especially on 64-bit systems,
+	 * which are unlikely to be configured with an address space fully
+	 * populated with memory, at least not anytime soon.
 	 */
-	return srcu_readers_lock_idx(ssp, idx) == unlocks;
+	return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks);
 }

 /**
@@ -347,17 +642,62 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
 	unsigned long sum = 0;

 	for_each_possible_cpu(cpu) {
-		struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
+		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);

-		sum += READ_ONCE(cpuc->srcu_lock_count[0]);
-		sum += READ_ONCE(cpuc->srcu_lock_count[1]);
-		sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
-		sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
+		sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks);
+		sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks);
+		sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks);
+		sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks);
 	}
 	return sum;
 }

-#define SRCU_INTERVAL	1
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below, boot-time configurable) to allow SRCU readers to exit
+ * their read-side critical sections. If there are still some readers
+ * after one jiffy, we repeatedly block for one-jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY	5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL		1	// Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL	10	// Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO	3UL	// Lowmark on default per-GP-phase
+							// no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI	1000UL	// Highmark on default per-GP-phase
+							// no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low)	((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high)	((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high)	SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// Per-GP-phase no-delay instances adjusted to allow non-sleeping poll of up
+// to one jiffy's duration. The multiplication by 2 factors in the
+// srcu_get_delay() called from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED	\
+	(2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE	\
+	SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED,	\
+		      SRCU_DEFAULT_MAX_NODELAY_PHASE_LO,	\
+		      SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY	(SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ?
\ + SRCU_DEFAULT_MAX_NODELAY_PHASE : 100) + +static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY; +module_param(srcu_max_nodelay, ulong, 0444); /* * Return grace-period delay, zero if there are expedited grace @@ -365,58 +705,121 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { - if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), - READ_ONCE(ssp->srcu_gp_seq_needed_exp))) - return 0; - return SRCU_INTERVAL; + unsigned long gpstart; + unsigned long j; + unsigned long jbase = SRCU_INTERVAL; + struct srcu_usage *sup = ssp->srcu_sup; + + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + if (srcu_gp_is_expedited(ssp)) + jbase = 0; + if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { + j = jiffies - 1; + gpstart = READ_ONCE(sup->srcu_gp_start); + if (time_after(j, gpstart)) + jbase += j - gpstart; + if (!jbase) { + ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay); + WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) + jbase = 1; + } + } + return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } -/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @ssp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + unsigned long delay; + struct srcu_usage *sup = ssp->srcu_sup; - if (WARN_ON(!srcu_get_delay(ssp))) + spin_lock_irq_rcu_node(ssp->srcu_sup); + delay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (WARN_ON(!delay)) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - if (quiesced) { - if (WARN_ON(delayed_work_pending(&ssp->work))) - return; /* Just leak it! */ - } else { - flush_delayed_work(&ssp->work); + flush_delayed_work(&sup->work); + for_each_possible_cpu(cpu) { + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); + + timer_delete_sync(&sdp->delay_work); + flush_work(&sdp->work); + if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) + return; /* Forgot srcu_barrier(), so just leak it! */ } - for_each_possible_cpu(cpu) - if (quiesced) { - if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work))) - return; /* Just leak it! */ - } else { - flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work); - } - if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) || WARN_ON(srcu_readers_active(ssp))) { - pr_info("%s: Active srcu_struct %p state: %d\n", - __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))); - return; /* Caller forgot to stop doing call_srcu()? */ + pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", + __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)), + rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed); + return; // Caller forgot to stop doing call_srcu()? + // Or caller invoked start_poll_synchronize_srcu() + // and then cleanup_srcu_struct() before that grace + // period ended? 
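+			// Either way, a teardown that can succeed looks
+			// roughly like this (sketch, assuming that all
+			// call_srcu() and polling activity has ceased):
+			//	srcu_barrier(ssp);	// Wait for callbacks.
+			//	cleanup_srcu_struct(ssp);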
+	}
+	kfree(sup->node);
+	sup->node = NULL;
+	sup->srcu_size_state = SRCU_SIZE_SMALL;
+	if (!sup->sda_is_static) {
+		free_percpu(ssp->sda);
+		ssp->sda = NULL;
+		kfree(sup);
+		ssp->srcu_sup = NULL;
 	}
-	free_percpu(ssp->sda);
-	ssp->sda = NULL;
 }
-EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
+
+/*
+ * Check for consistent reader flavor.
+ */
+void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
+{
+	int old_read_flavor;
+	struct srcu_data *sdp;
+
+	/* NMI-unsafe use in NMI is a bad sign, as are multi-bit read_flavor values. */
+	WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi());
+	WARN_ON_ONCE(read_flavor & (read_flavor - 1));
+
+	sdp = raw_cpu_ptr(ssp->sda);
+	old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor);
+	WARN_ON_ONCE(ssp->srcu_reader_flavor && read_flavor != ssp->srcu_reader_flavor);
+	WARN_ON_ONCE(old_read_flavor && ssp->srcu_reader_flavor &&
+		     old_read_flavor != ssp->srcu_reader_flavor);
+	WARN_ON_ONCE(read_flavor == SRCU_READ_FLAVOR_FAST && !ssp->srcu_reader_flavor);
+	if (!old_read_flavor) {
+		old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
+		if (!old_read_flavor)
+			return;
+	}
+	WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor);
+}
+EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);

 /*
  * Counts the new reader in the appropriate per-CPU element of the
  * srcu_struct.
- * Returns an index that must be passed to the matching srcu_read_unlock().
+ * Returns a guaranteed non-negative index that must be passed to the
+ * matching __srcu_read_unlock().
  */
 int __srcu_read_lock(struct srcu_struct *ssp)
 {
-	int idx;
+	struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);

-	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
-	this_cpu_inc(ssp->sda->srcu_lock_count[idx]);
+	this_cpu_inc(scp->srcu_locks.counter);
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
-	return idx;
+	return __srcu_ptr_to_ctr(ssp, scp);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_lock);

@@ -428,74 +831,76 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 {
 	smp_mb(); /* C */  /* Avoid leaking the critical section. */
-	this_cpu_inc(ssp->sda->srcu_unlock_count[idx]);
+	this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter);
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);

+#ifdef CONFIG_NEED_SRCU_NMI_SAFE
+
 /*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited().  We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections.  If there are still some readers after a few microseconds,
- * we repeatedly block for 1-millisecond time periods.
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct, but in an NMI-safe manner using RMW atomics.
+ * Returns an index that must be passed to the matching srcu_read_unlock().
 */
-#define SRCU_RETRY_CHECK_DELAY	5
+int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
+{
+	struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp);
+	struct srcu_ctr *scp = raw_cpu_ptr(scpp);
+
+	atomic_long_inc(&scp->srcu_locks);
+	smp_mb__after_atomic(); /* B */  /* Avoid leaking the critical section. */
+	return __srcu_ptr_to_ctr(ssp, scpp);
+}
+EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
+
+/*
+ * Removes the count for the old reader from the appropriate per-CPU
+ * element of the srcu_struct.
Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + */ +void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) +{ + smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ + atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); + +#endif // CONFIG_NEED_SRCU_NMI_SAFE /* * Start an SRCU grace period. */ static void srcu_gp_start(struct srcu_struct *ssp) { - struct srcu_data *sdp = this_cpu_ptr(ssp->sda); int state; - lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); - spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); - spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)); + WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); + WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ - rcu_seq_start(&ssp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq); + state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } -/* - * Track online CPUs to guide callback workqueue placement. - */ -DEFINE_PER_CPU(bool, srcu_online); -void srcu_online_cpu(unsigned int cpu) +static void srcu_delay_timer(struct timer_list *t) { - WRITE_ONCE(per_cpu(srcu_online, cpu), true); -} + struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work); -void srcu_offline_cpu(unsigned int cpu) -{ - WRITE_ONCE(per_cpu(srcu_online, cpu), false); + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); } -/* - * Place the workqueue handler on the specified CPU if online, otherwise - * just run it whereever. This is useful for placing workqueue handlers - * that are to invoke the specified CPU's callbacks. 
- */ -static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, +static void srcu_queue_delayed_work_on(struct srcu_data *sdp, unsigned long delay) { - bool ret; + if (!delay) { + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); + return; + } - preempt_disable(); - if (READ_ONCE(per_cpu(srcu_online, cpu))) - ret = queue_delayed_work_on(cpu, wq, dwork, delay); - else - ret = queue_delayed_work(wq, dwork, delay); - preempt_enable(); - return ret; + timer_reduce(&sdp->delay_work, jiffies + delay); } /* @@ -504,7 +909,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); + srcu_queue_delayed_work_on(sdp, delay); } /* @@ -519,7 +924,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp int cpu; for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - if (!(mask & (1 << (cpu - snp->grplo)))) + if (!(mask & (1UL << (cpu - snp->grplo)))) continue; srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } @@ -536,80 +941,100 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp */ static void srcu_gp_end(struct srcu_struct *ssp) { - unsigned long cbdelay; + unsigned long cbdelay = 1; bool cbs; bool last_lvl; int cpu; - unsigned long flags; unsigned long gpseq; int idx; unsigned long mask; struct srcu_data *sdp; + unsigned long sgsne; struct srcu_node *snp; + int ss_state; + struct srcu_usage *sup = ssp->srcu_sup; /* Prevent more than one additional grace period. */ - mutex_lock(&ssp->srcu_cb_mutex); + mutex_lock(&sup->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(ssp); - idx = rcu_seq_state(ssp->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = srcu_get_delay(ssp); - ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); - rcu_seq_end(&ssp->srcu_gp_seq); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) - ssp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq_rcu_node(ssp); - mutex_unlock(&ssp->srcu_gp_mutex); + if (srcu_gp_is_expedited(ssp)) + cbdelay = 0; + + WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns()); + rcu_seq_end(&sup->srcu_gp_seq); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq); + spin_unlock_irq_rcu_node(sup); + mutex_unlock(&sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ /* Initiate callback invocation as needed. */ - idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); - srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); - cbs = false; - last_lvl = snp >= ssp->level[rcu_num_lvls - 1]; - if (last_lvl) - cbs = snp->srcu_have_cbs[idx] == gpseq; - snp->srcu_have_cbs[idx] = gpseq; - rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); - if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) - snp->srcu_gp_seq_needed_exp = gpseq; - mask = snp->srcu_data_have_cbs[idx]; - snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); - if (cbs) - srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); - - /* Occasionally prevent srcu_data counter wrap. 
*/ - if (!(gpseq & counter_wrap_check) && last_lvl) - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irqsave_rcu_node(sdp, flags); - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed + 100)) - sdp->srcu_gp_seq_needed = gpseq; - if (ULONG_CMP_GE(gpseq, - sdp->srcu_gp_seq_needed_exp + 100)) - sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irqrestore_rcu_node(sdp, flags); - } + ss_state = smp_load_acquire(&sup->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_BARRIER) { + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()), + cbdelay); + } else { + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + srcu_for_each_node_breadth_first(ssp, snp) { + spin_lock_irq_rcu_node(snp); + cbs = false; + last_lvl = snp >= sup->level[rcu_num_lvls - 1]; + if (last_lvl) + cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + sgsne = snp->srcu_gp_seq_needed_exp; + if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); + if (ss_state < SRCU_SIZE_BIG) + mask = ~0; + else + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + spin_unlock_irq_rcu_node(snp); + if (cbs) + srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); + } } + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(ssp->sda, cpu); + spin_lock_irq_rcu_node(sdp); + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) + sdp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irq_rcu_node(sdp); + } + /* Callback initiation done, allow grace periods after next. */ - mutex_unlock(&ssp->srcu_cb_mutex); + mutex_unlock(&sup->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(ssp); - gpseq = rcu_seq_current(&ssp->srcu_gp_seq); + spin_lock_irq_rcu_node(sup); + gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (!rcu_seq_state(gpseq) && - ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { + ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) { srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(sup); srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(sup); + } + + /* Transition to big if needed. 
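+	 * (Per the code below, this proceeds one step per grace period:
+	 * SRCU_SIZE_ALLOC allocates the srcu_node combining tree, and each
+	 * later grace period advances ->srcu_size_state until it reaches
+	 * SRCU_SIZE_BIG.)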
 */
+	if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
+		if (ss_state == SRCU_SIZE_ALLOC)
+			init_srcu_struct_nodes(ssp, GFP_KERNEL);
+		else
+			smp_store_release(&sup->srcu_size_state, ss_state + 1);
+	}
 }

@@ -624,23 +1049,27 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
 				  unsigned long s)
 {
 	unsigned long flags;
+	unsigned long sgsne;

-	for (; snp != NULL; snp = snp->srcu_parent) {
-		if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
-		    ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
-			return;
-		spin_lock_irqsave_rcu_node(snp, flags);
-		if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
+	if (snp)
+		for (; snp != NULL; snp = snp->srcu_parent) {
+			sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
+			if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
+			    (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
+				return;
+			spin_lock_irqsave_rcu_node(snp, flags);
+			sgsne = snp->srcu_gp_seq_needed_exp;
+			if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
+				spin_unlock_irqrestore_rcu_node(snp, flags);
+				return;
+			}
+			WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
 			spin_unlock_irqrestore_rcu_node(snp, flags);
-			return;
 		}
-		WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
-		spin_unlock_irqrestore_rcu_node(snp, flags);
-	}
-	spin_lock_irqsave_rcu_node(ssp, flags);
-	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
-		ssp->srcu_gp_seq_needed_exp = s;
-	spin_unlock_irqrestore_rcu_node(ssp, flags);
+	spin_lock_irqsave_ssp_contention(ssp, &flags);
+	if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+		WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
+	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
 }

 /*
@@ -652,116 +1081,175 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
 *
 * Note that this function also does the work of srcu_funnel_exp_start(),
 * in some cases by directly invoking it.
+ *
+ * The srcu read lock should be held around this function, and s is a
+ * grace-period sequence snapshot taken after that lock is acquired.
 */
 static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
 				 unsigned long s, bool do_norm)
 {
 	unsigned long flags;
 	int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
-	struct srcu_node *snp = sdp->mynode;
+	unsigned long sgsne;
+	struct srcu_node *snp;
+	struct srcu_node *snp_leaf;
 	unsigned long snp_seq;
+	struct srcu_usage *sup = ssp->srcu_sup;

-	/* Each pass through the loop does one level of the srcu_node tree. */
-	for (; snp != NULL; snp = snp->srcu_parent) {
-		if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != sdp->mynode)
-			return; /* GP already done and CBs recorded. */
-		spin_lock_irqsave_rcu_node(snp, flags);
-		if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+	/* Ensure that the snp node tree is fully initialized before traversing it. */
+	if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+		snp_leaf = NULL;
+	else
+		snp_leaf = sdp->mynode;
+
+	if (snp_leaf)
+		/* Each pass through the loop does one level of the srcu_node tree. */
+		for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
+			if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
+				return; /* GP already done and CBs recorded. */
+			spin_lock_irqsave_rcu_node(snp, flags);
 			snp_seq = snp->srcu_have_cbs[idx];
-		if (snp == sdp->mynode && snp_seq == s)
-			snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
-		spin_unlock_irqrestore_rcu_node(snp, flags);
-		if (snp == sdp->mynode && snp_seq != s) {
-			srcu_schedule_cbs_sdp(sdp, do_norm
-					      ?
SRCU_INTERVAL - : 0); + if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { + if (snp == snp_leaf && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + spin_unlock_irqrestore_rcu_node(snp, flags); + if (snp == snp_leaf && snp_seq != s) { + srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); + return; + } + if (!do_norm) + srcu_funnel_exp_start(ssp, snp, s); return; } - if (!do_norm) - srcu_funnel_exp_start(ssp, snp, s); - return; + snp->srcu_have_cbs[idx] = s; + if (snp == snp_leaf) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + sgsne = snp->srcu_gp_seq_needed_exp; + if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore_rcu_node(snp, flags); } - snp->srcu_have_cbs[idx] = s; - if (snp == sdp->mynode) - snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) - snp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore_rcu_node(snp, flags); - } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_rcu_node(ssp, flags); - if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { + spin_lock_irqsave_ssp_contention(ssp, &flags); + if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load * acquire setting up for initialization. */ - smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ + smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/ } - if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) - ssp->srcu_gp_seq_needed_exp = s; + if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s)) + WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s); - /* If grace period not already done and none in progress, start it. */ - if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && - rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); + /* If grace period not already in progress, start it. */ + if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && + rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) { srcu_gp_start(ssp); + + // And how can that list_add() in the "else" clause + // possibly be safe for concurrent execution? Well, + // it isn't. And it does not have to be. After all, it + // can only be executed during early boot when there is only + // the one boot CPU running with interrupts still disabled. if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &ssp->work, - srcu_get_delay(ssp)); - else if (list_empty(&ssp->work.work.entry)) - list_add(&ssp->work.work.entry, &srcu_boot_list); + queue_delayed_work(rcu_gp_wq, &sup->work, + !!srcu_get_delay(ssp)); + else if (list_empty(&sup->work.work.entry)) + list_add(&sup->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(ssp, flags); + spin_unlock_irqrestore_rcu_node(sup, flags); } /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->srcu_idx is not changed while checking. + * The caller must ensure that ->srcu_ctrp is not changed while checking. 
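+ *
+ * For example (illustrative numbers only): given trycount == 1 and an
+ * expedited grace period pending, so that srcu_get_delay() returns zero,
+ * the loop below performs two scans separated by srcu_retry_check_delay
+ * microseconds; with no expedited grace period pending, it gives up
+ * after a single scan.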
*/ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { + unsigned long curdelay; + + spin_lock_irq_rcu_node(ssp->srcu_sup); + curdelay = !srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) return true; - if (--trycount + !srcu_get_delay(ssp) <= 0) + if ((--trycount + curdelay) <= 0) return false; - udelay(SRCU_RETRY_CHECK_DELAY); + udelay(srcu_retry_check_delay); } } /* - * Increment the ->srcu_idx counter so that future SRCU readers will + * Increment the ->srcu_ctrp counter so that future SRCU readers will * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *ssp) { /* - * Ensure that if this updater saw a given reader's increment - * from __srcu_read_lock(), that reader was using an old value - * of ->srcu_idx. Also ensure that if a given reader sees the - * new value of ->srcu_idx, this updater's earlier scans cannot - * have seen that reader's increments (which is OK, because this - * grace period need not wait on that reader). + * Because the flip of ->srcu_ctrp is executed only if the + * preceding call to srcu_readers_active_idx_check() found that + * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums + * matched and because that summing uses atomic_long_read(), + * there is ordering due to a control dependency between that + * summing and the WRITE_ONCE() in this call to srcu_flip(). + * This ordering ensures that if this updater saw a given reader's + * increment from __srcu_read_lock(), that reader was using a value + * of ->srcu_ctrp from before the previous call to srcu_flip(), + * which should be quite rare. This ordering thus helps forward + * progress because the grace period could otherwise be delayed + * by additional calls to __srcu_read_lock() using that old (soon + * to be new) value of ->srcu_ctrp. + * + * This sum-equality check and ordering also ensures that if + * a given call to __srcu_read_lock() uses the new value of + * ->srcu_ctrp, this updater's earlier scans cannot have seen + * that reader's increments, which is all to the good, because + * this grace period need not wait on that reader. After all, + * if those earlier scans had seen that reader, there would have + * been a sum mismatch and this code would not be reached. + * + * This means that the following smp_mb() is redundant, but + * it stays until either (1) Compilers learn about this sort of + * control dependency or (2) Some production workload running on + * a production system is unduly delayed by this slowpath smp_mb(). + * Except for _lite() readers, where it is inoperative, which + * means that it is a good thing that it is redundant. */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + WRITE_ONCE(ssp->srcu_ctrp, + &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); /* * Ensure that if the updater misses an __srcu_read_unlock() - * increment, that task's next __srcu_read_lock() will see the - * above counter update. Note that both this memory barrier - * and the one in srcu_readers_active_idx_check() provide the - * guarantee for __srcu_read_lock(). + * increment, that task's __srcu_read_lock() following its next + * __srcu_read_lock() or __srcu_read_unlock() will see the above + * counter update. 
Note that both this memory barrier and the + * one in srcu_readers_active_idx_check() provide the guarantee + * for __srcu_read_lock(). + * + * Note that this is a performance optimization, in which we spend + * an otherwise unnecessary smp_mb() in order to reduce the number + * of full per-CPU-variable scans in srcu_readers_lock_idx() and + * srcu_readers_unlock_idx(). But this performance optimization + * is not so optimal for SRCU-fast, where we would be spending + * not smp_mb(), but rather synchronize_rcu(). At the same time, + * the overhead of the smp_mb() is in the noise, so there is no + * point in omitting it in the SRCU-fast case. So the same code + * is executed either way. */ smp_mb(); /* D */ /* Pairs with C. */ } /* - * If SRCU is likely idle, return true, otherwise return false. + * If SRCU is likely idle, in other words, the next SRCU grace period + * should be expedited, return true, otherwise return false. Except that + * in the presence of _lite() readers, always return false. * * Note that it is OK for several current from-idle requests for a new * grace period from idle to specify expediting because they will all end @@ -778,45 +1266,50 @@ static void srcu_flip(struct srcu_struct *ssp) * it, if this function was preempted for enough time for the counters * to wrap, it really doesn't matter whether or not we expedite the grace * period. The extra overhead of a needlessly expedited grace period is - * negligible when amoritized over that time period, and the extra latency + * negligible when amortized over that time period, and the extra latency * of a needlessly non-expedited grace period is similarly negligible. */ -static bool srcu_might_be_idle(struct srcu_struct *ssp) +static bool srcu_should_expedite(struct srcu_struct *ssp) { unsigned long curseq; unsigned long flags; struct srcu_data *sdp; unsigned long t; + unsigned long tlast; + check_init_srcu_struct(ssp); + /* If _lite() readers, don't do unsolicited expediting. */ + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP) + return false; /* If the local srcu_data structure has callbacks, not idle. */ - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - local_irq_restore(flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); /* - * No local callbacks, so probabalistically probe global state. + * No local callbacks, so probabilistically probe global state. * Exact information would require acquiring locks, which would - * kill scalability, hence the probabalistic nature of the probe. + * kill scalability, hence the probabilistic nature of the probe. */ /* First, see if enough time has passed since the last GP. */ t = ktime_get_mono_fast_ns(); + tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end); if (exp_holdoff == 0 || - time_in_range_open(t, ssp->srcu_last_gp_end, - ssp->srcu_last_gp_end + exp_holdoff)) + time_in_range_open(t, tlast, tlast + exp_holdoff)) return false; /* Too soon after last GP. */ /* Next, check for probable idleness. */ - curseq = rcu_seq_current(&ssp->srcu_gp_seq); + curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. 
*/ - if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed))) + if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed))) return false; /* Grace period in progress, so not idle. */ smp_mb(); /* Order ->srcu_gp_seq with prior access. */ - if (curseq != rcu_seq_current(&ssp->srcu_gp_seq)) + if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)) return false; /* GP # changed, so not idle. */ return true; /* With reasonable probability, idle! */ } @@ -829,6 +1322,109 @@ static void srcu_leak_callback(struct rcu_head *rhp) } /* + * Start an SRCU grace period, and also queue the callback if non-NULL. + */ +static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, + struct rcu_head *rhp, bool do_norm) +{ + unsigned long flags; + int idx; + bool needexp = false; + bool needgp = false; + unsigned long s; + struct srcu_data *sdp; + struct srcu_node *sdp_mynode; + int ss_state; + + check_init_srcu_struct(ssp); + /* + * While starting a new grace period, make sure we are in an + * SRCU read-side critical section so that the grace-period + * sequence number cannot wrap around in the meantime. + */ + idx = __srcu_read_lock_nmisafe(ssp); + ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); + if (ss_state < SRCU_SIZE_WAIT_CALL) + sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); + else + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (rhp) + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + /* + * It's crucial to capture the snapshot 's' for acceleration before + * reading the current gp_seq that is used for advancing. This is + * essential because if the acceleration snapshot is taken after a + * failed advancement attempt, there's a risk that a grace period may + * conclude and a new one may start in the interim. If the snapshot is + * captured after this sequence of events, the acceleration snapshot 's' + * could be excessively advanced, leading to acceleration failure. + * In such a scenario, an 'acceleration leak' can occur, where new + * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment. + * Also note that encountering advancing failures is a normal + * occurrence when the grace period for RCU_WAIT_TAIL is in progress. + * + * To see this, consider the following events which occur if + * rcu_seq_snap() were to be called after advance: + * + * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the + * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). + * + * 2) The grace period for RCU_WAIT_TAIL is seen as started but not + * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1. + * + * 3) This value is passed to rcu_segcblist_advance() which can't move + * any segment forward and fails. + * + * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. + * But then the call to rcu_seq_snap() observes the grace period for the + * RCU_WAIT_TAIL segment as completed and the subsequent one for the + * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1) + * so it returns a snapshot of the next grace period, which is X + 12. + * + * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the + * freshly enqueued callback in RCU_NEXT_TAIL can't move to + * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace + * period (gp_num = X + 8). So acceleration fails. 
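+	 *
+	 * (Reminder: grace-period sequence numbers advance by 4 per grace
+	 * period, with the low-order bits encoding the SRCU_STATE_* phase,
+	 * which is why the example above steps through X + 4, X + 8, and
+	 * X + 12.)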
+ */ + s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); + if (rhp) { + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * Acceleration can never fail because the base current gp_seq + * used for acceleration is <= the value of gp_seq used for + * advancing. This means that RCU_NEXT_TAIL segment will + * always be able to be emptied by the acceleration into the + * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments. + */ + WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s)); + } + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; + } + if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { + sdp->srcu_gp_seq_needed_exp = s; + needexp = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + + /* Ensure that snp node tree is fully initialized before traversing it */ + if (ss_state < SRCU_SIZE_WAIT_BARRIER) + sdp_mynode = NULL; + else + sdp_mynode = sdp->mynode; + + if (needgp) + srcu_funnel_gp_start(ssp, sdp, s, do_norm); + else if (needexp) + srcu_funnel_exp_start(ssp, sdp_mynode, s); + __srcu_read_unlock_nmisafe(ssp, idx); + return s; +} + +/* * Enqueue an SRCU callback on the srcu_data structure associated with * the current CPU and the specified srcu_struct structure, initiating * grace-period processing if it is not already running. @@ -856,17 +1452,9 @@ static void srcu_leak_callback(struct rcu_head *rhp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, - rcu_callback_t func, bool do_norm) +static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) { - unsigned long flags; - int idx; - bool needexp = false; - bool needgp = false; - unsigned long s; - struct srcu_data *sdp; - - check_init_srcu_struct(ssp); if (debug_rcu_head_queue(rhp)) { /* Probable double call_srcu(), so leak the callback. */ WRITE_ONCE(rhp->func, srcu_leak_callback); @@ -874,29 +1462,7 @@ void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, return; } rhp->func = func; - idx = srcu_read_lock(ssp); - local_irq_save(flags); - sdp = this_cpu_ptr(ssp->sda); - spin_lock_rcu_node(sdp); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); - if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { - sdp->srcu_gp_seq_needed = s; - needgp = true; - } - if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { - sdp->srcu_gp_seq_needed_exp = s; - needexp = true; - } - spin_unlock_irqrestore_rcu_node(sdp, flags); - if (needgp) - srcu_funnel_gp_start(ssp, sdp, s, do_norm); - else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); - srcu_read_unlock(ssp, idx); + (void)srcu_gp_start_if_needed(ssp, rhp, do_norm); } /** @@ -913,8 +1479,12 @@ void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, * read-side critical sections are delimited by srcu_read_lock() and * srcu_read_unlock(), and may be nested. * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. + * The callback will be invoked from process context, but with bh + * disabled. The callback function must therefore be fast and must + * not block. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. 
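+ *
+ * A minimal usage sketch (the struct, field, and function names here are
+ * illustrative, not part of this API):
+ *
+ *	struct foo {
+ *		struct rcu_head rh;
+ *		// Other fields, protected by an srcu_struct.
+ *	};
+ *
+ *	static void foo_reclaim(struct rcu_head *rhp)
+ *	{
+ *		kfree(container_of(rhp, struct foo, rh));
+ *	}
+ *
+ *	// After removing a struct foo *p from reader visibility:
+ *	call_srcu(&foo_srcu, &p->rh, foo_reclaim);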
*/ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) @@ -930,7 +1500,9 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct rcu_synchronize rcu; - RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) || + srcu_lock_sync(&ssp->dep_map); + + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), @@ -978,8 +1550,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, - * and then flip the srcu_idx and wait for the count of the other index. + * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero + * at first, and then flip the ->srcu_ctrp and wait for the count of the + * other index. * * Can block; must be called from process context. * @@ -1011,20 +1584,96 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are * passed the same srcu_struct structure. * - * If SRCU is likely idle, expedite the first request. This semantic - * was provided by Classic SRCU, and is relied upon by its users, so TREE - * SRCU must also provide it. Note that detecting idleness is heuristic - * and subject to both false positives and negatives. + * Implementation of these memory-ordering guarantees is similar to + * that of synchronize_rcu(). + * + * If SRCU is likely idle as determined by srcu_should_expedite(), + * expedite the first request. This semantic was provided by Classic SRCU, + * and is relied upon by its users, so TREE SRCU must also provide it. + * Note that detecting idleness is heuristic and subject to both false + * positives and negatives. */ void synchronize_srcu(struct srcu_struct *ssp) { - if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) + if (srcu_should_expedite(ssp) || rcu_gp_is_expedited()) synchronize_srcu_expedited(ssp); else __synchronize_srcu(ssp, true); } EXPORT_SYMBOL_GPL(synchronize_srcu); +/** + * get_state_synchronize_srcu - Provide an end-of-grace-period cookie + * @ssp: srcu_struct to provide cookie for. + * + * This function returns a cookie that can be passed to + * poll_state_synchronize_srcu(), which will return true if a full grace + * period has elapsed in the meantime. It is the caller's responsibility + * to make sure that grace period happens, for example, by invoking + * call_srcu() after return from get_state_synchronize_srcu(). + */ +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) +{ + // Any prior manipulation of SRCU-protected data must happen + // before the load from ->srcu_gp_seq. + smp_mb(); + return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); + +/** + * start_poll_synchronize_srcu - Provide cookie and start grace period + * @ssp: srcu_struct to provide cookie for. + * + * This function returns a cookie that can be passed to + * poll_state_synchronize_srcu(), which will return true if a full grace + * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(), + * this function also ensures that any needed SRCU grace period will be + * started. This convenience does come at a cost in terms of CPU overhead. 
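+ *
+ * A polling sketch (illustrative; my_srcu is a hypothetical srcu_struct):
+ *
+ *	unsigned long cookie;
+ *
+ *	cookie = start_poll_synchronize_srcu(&my_srcu);
+ *	do_something_while_waiting();
+ *	if (!poll_state_synchronize_srcu(&my_srcu, cookie))
+ *		synchronize_srcu(&my_srcu); // Wait out the remainder.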
+ */
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
+{
+	return srcu_gp_start_if_needed(ssp, NULL, true);
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
+
+/**
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
+ * @ssp: srcu_struct to provide cookie for.
+ * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
+ *
+ * This function takes the cookie that was returned from either
+ * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
+ * returns true if an SRCU grace period elapsed since the time that the
+ * cookie was created.
+ *
+ * Because cookies are finite in size, wrapping/overflow is possible.
+ * This is more pronounced on 32-bit systems where cookies are 32 bits,
+ * where in theory wrapping could happen in about 14 hours assuming
+ * 25-microsecond expedited SRCU grace periods. However, a more likely
+ * overflow lower bound is on the order of 24 days in the case of
+ * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit
+ * system requires geologic timespans, as in more than seven million years
+ * even for expedited SRCU grace periods.
+ *
+ * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems
+ * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. Tiny SRCU
+ * uses a 16-bit cookie, which rcutorture routinely wraps in a matter of a
+ * few minutes. If this proves to be a problem, this counter will be
+ * expanded to the same size as for Tree SRCU.
+ */
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
+{
+	if (cookie != SRCU_GET_STATE_COMPLETED &&
+	    !rcu_seq_done_exact(&ssp->srcu_sup->srcu_gp_seq, cookie))
+		return false;
+	// Ensure that the end of the SRCU grace period happens before
+	// any subsequent code that the caller might execute.
+	smp_mb(); // ^^^
+	return true;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+
 /*
  * Callback function for srcu_barrier() use.
  */
@@ -1033,10 +1682,33 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
 	struct srcu_data *sdp;
 	struct srcu_struct *ssp;

+	rhp->next = rhp; // Mark the callback as having been invoked.
 	sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
 	ssp = sdp->ssp;
-	if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
-		complete(&ssp->srcu_barrier_completion);
+	if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+		complete(&ssp->srcu_sup->srcu_barrier_completion);
+}
+
+/*
+ * Enqueue an srcu_barrier() callback on the specified srcu_data
+ * structure's ->cblist, but only if that ->cblist already has at least one
+ * callback enqueued. Note that if a CPU already has callbacks enqueued,
+ * it must have already registered the need for a future grace period,
+ * so all we need do is enqueue a callback that will use the same grace
+ * period as the last callback already in the queue.
+ */ +static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) +{ + spin_lock_irq_rcu_node(sdp); + atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); + atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt); + } + spin_unlock_irq_rcu_node(sdp); } /** @@ -1046,54 +1718,98 @@ static void srcu_barrier_cb(struct rcu_head *rhp) void srcu_barrier(struct srcu_struct *ssp) { int cpu; - struct srcu_data *sdp; - unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); + int idx; + unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq); check_init_srcu_struct(ssp); - mutex_lock(&ssp->srcu_barrier_mutex); - if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) { + mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex); + if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) { smp_mb(); /* Force ordering following return. */ - mutex_unlock(&ssp->srcu_barrier_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); return; /* Someone else did our work for us. */ } - rcu_seq_start(&ssp->srcu_barrier_seq); - init_completion(&ssp->srcu_barrier_completion); + rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq); + init_completion(&ssp->srcu_sup->srcu_barrier_completion); /* Initial count prevents reaching zero until all CBs are posted. */ - atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); + atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1); - /* - * Each pass through this loop enqueues a callback, but only - * on CPUs already having callbacks enqueued. Note that if - * a CPU already has callbacks enqueue, it must have already - * registered the need for a future grace period, so all we - * need do is enqueue a callback that will use the same - * grace period as the last callback already in the queue. - */ - for_each_possible_cpu(cpu) { - sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); - atomic_inc(&ssp->srcu_barrier_cpu_cnt); - sdp->srcu_barrier_head.func = srcu_barrier_cb; - debug_rcu_head_queue(&sdp->srcu_barrier_head); - if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) { - debug_rcu_head_unqueue(&sdp->srcu_barrier_head); - atomic_dec(&ssp->srcu_barrier_cpu_cnt); - } - spin_unlock_irq_rcu_node(sdp); - } + idx = __srcu_read_lock_nmisafe(ssp); + if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id())); + else + for_each_possible_cpu(cpu) + srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); + __srcu_read_unlock_nmisafe(ssp, idx); /* Remove the initial count, at which point reaching zero can happen. */ - if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) - complete(&ssp->srcu_barrier_completion); - wait_for_completion(&ssp->srcu_barrier_completion); + if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) + complete(&ssp->srcu_sup->srcu_barrier_completion); + wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion); - rcu_seq_end(&ssp->srcu_barrier_seq); - mutex_unlock(&ssp->srcu_barrier_mutex); + rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq); + mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); +/* Callback for srcu_expedite_current() usage. 
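+ *
+ * State-machine sketch, as implied by the code below: a first
+ * srcu_expedite_current() takes ->srcu_ec_state from SRCU_EC_IDLE to
+ * SRCU_EC_PENDING and posts this callback; further requests arriving
+ * before it runs move PENDING to SRCU_EC_REPOST. When invoked, this
+ * callback either retires PENDING back to IDLE or, for REPOST, drops
+ * back to PENDING and requeues itself for one more expedited pass.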
*/ +static void srcu_expedite_current_cb(struct rcu_head *rhp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head); + + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + WARN_ON_ONCE(1); + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_IDLE; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, requeue ourselves as an expedited SRCU callback. + if (needcb) + __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); +} + +/** + * srcu_expedite_current - Expedite the current SRCU grace period + * @ssp: srcu_struct to expedite. + * + * Cause the current SRCU grace period to become expedited. The grace + * period following the current one might also be expedited. If there is + * no current grace period, one might be created. If the current grace + * period is currently sleeping, that sleep will complete before expediting + * will take effect. + */ +void srcu_expedite_current(struct srcu_struct *ssp) +{ + unsigned long flags; + bool needcb = false; + struct srcu_data *sdp; + + migrate_disable(); + sdp = this_cpu_ptr(ssp->sda); + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (sdp->srcu_ec_state == SRCU_EC_IDLE) { + sdp->srcu_ec_state = SRCU_EC_PENDING; + needcb = true; + } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { + sdp->srcu_ec_state = SRCU_EC_REPOST; + } else { + WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + // If needed, queue an expedited SRCU callback. + if (needcb) + __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); + migrate_enable(); +} +EXPORT_SYMBOL_GPL(srcu_expedite_current); + /** * srcu_batches_completed - return batches completed. * @ssp: srcu_struct on which to report batch completion. @@ -1103,7 +1819,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return ssp->srcu_idx; + return READ_ONCE(ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1116,11 +1832,11 @@ static void srcu_advance_state(struct srcu_struct *ssp) { int idx; - mutex_lock(&ssp->srcu_gp_mutex); + mutex_lock(&ssp->srcu_sup->srcu_gp_mutex); /* * Because readers might be delayed for an extended period after - * fetching ->srcu_idx for their index, at any point in time there + * fetching ->srcu_ctrp for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -1128,46 +1844,50 @@ static void srcu_advance_state(struct srcu_struct *ssp) * The load-acquire ensures that we see the accesses performed * by the prior grace period. 
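+	 *
+	 * (Hence the two scan states below: SRCU_STATE_SCAN1 waits for
+	 * the not-currently-active index to drain, srcu_flip() then
+	 * redirects new readers to that index, and SRCU_STATE_SCAN2
+	 * waits for the previously active index to drain as well.)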
*/ - idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ + idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(ssp); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); - spin_unlock_irq_rcu_node(ssp); - mutex_unlock(&ssp->srcu_gp_mutex); + spin_lock_irq_rcu_node(ssp->srcu_sup); + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; } - idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); + idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (idx != SRCU_STATE_IDLE) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* Someone else started the grace period. */ } } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (ssp->srcu_idx & 1); + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 1)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } srcu_flip(ssp); - rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); + spin_lock_irq_rcu_node(ssp->srcu_sup); + rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2); + ssp->srcu_sup->srcu_n_exp_nodelay = 0; + spin_unlock_irq_rcu_node(ssp->srcu_sup); } - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 2)) { - mutex_unlock(&ssp->srcu_gp_mutex); + mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ } + ssp->srcu_sup->srcu_n_exp_nodelay = 0; srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ } } @@ -1180,18 +1900,26 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ static void srcu_invoke_callbacks(struct work_struct *work) { + long len; bool more; struct rcu_cblist ready_cbs; struct rcu_head *rhp; struct srcu_data *sdp; struct srcu_struct *ssp; - sdp = container_of(work, struct srcu_data, work.work); + sdp = container_of(work, struct srcu_data, work); + ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); + WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); + rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * Although this function is theoretically re-entrant, concurrent + * callbacks invocation is disallowed to avoid executing an SRCU barrier + * too early. + */ if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { spin_unlock_irq_rcu_node(sdp); @@ -1201,26 +1929,28 @@ static void srcu_invoke_callbacks(struct work_struct *work) /* We are on the job! Extract and invoke ready callbacks. 
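The idx computation changed form here (pointer arithmetic on ->srcu_ctrp rather than the old ->srcu_idx & 1), but the invariant is unchanged: a grace period can end only after the inactive index has drained on both sides of a flip. A rough single-threaded model of the drain check, with invented arrays standing in for the per-CPU srcu_ctrs counters:

#define NCPUS 4

/* Per-index, per-CPU counters: a reader bumps locks[idx][cpu] on entry
 * and unlocks[idx][cpu] on exit. */
static unsigned long locks[2][NCPUS];
static unsigned long unlocks[2][NCPUS];

/* Rough analogue of try_check_zero(): no reader still holds @idx. */
static int readers_drained(int idx)
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		sum += locks[idx][cpu] - unlocks[idx][cpu];
	return sum == 0;
}

/* Grace-period skeleton; the real code requeues itself on a workqueue
 * instead of spinning. */
static void gp_once(int *active_idx)
{
	while (!readers_drained(!*active_idx))
		;			/* SCAN1: drain pre-existing readers */
	*active_idx = !*active_idx;	/* srcu_flip(): new readers switch over */
	while (!readers_drained(!*active_idx))
		;			/* SCAN2: drain the other index too */
}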
*/ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + len = ready_cbs.len; spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); } + WARN_ON_ONCE(ready_cbs.len); /* * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ spin_lock_irq_rcu_node(sdp); - rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_gp_seq)); + rcu_segcblist_add_len(&sdp->srcu_cblist, -len); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); + /* An SRCU barrier or callbacks from previous nesting work pending */ if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1233,20 +1963,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(ssp); - if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { - if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { + spin_lock_irq_rcu_node(ssp->srcu_sup); + if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ pushgp = false; } - } else if (!rcu_seq_state(ssp->srcu_gp_seq)) { + } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) { /* Outstanding request and no GP. Start one. */ srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (pushgp) - queue_delayed_work(rcu_gp_wq, &ssp->work, delay); + queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay); } /* @@ -1254,61 +1984,106 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) */ static void process_srcu(struct work_struct *work) { + unsigned long curdelay; + unsigned long j; struct srcu_struct *ssp; + struct srcu_usage *sup; - ssp = container_of(work, struct srcu_struct, work.work); + sup = container_of(work, struct srcu_usage, work.work); + ssp = sup->srcu_ssp; srcu_advance_state(ssp); - srcu_reschedule(ssp, srcu_get_delay(ssp)); + spin_lock_irq_rcu_node(ssp->srcu_sup); + curdelay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (curdelay) { + WRITE_ONCE(sup->reschedule_count, 0); + } else { + j = jiffies; + if (READ_ONCE(sup->reschedule_jiffies) == j) { + ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count); + WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1); + if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay) + curdelay = 1; + } else { + WRITE_ONCE(sup->reschedule_count, 1); + WRITE_ONCE(sup->reschedule_jiffies, j); + } + } + srcu_reschedule(ssp, curdelay); } -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *ssp, int *flags, +void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { - if (test_type != SRCU_FLAVOR) - return; *flags = 0; - *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq); + *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +static const char * const srcu_size_state_name[] = { + "SRCU_SIZE_SMALL", + "SRCU_SIZE_ALLOC", + "SRCU_SIZE_WAIT_BARRIER", + "SRCU_SIZE_WAIT_CALL", + "SRCU_SIZE_WAIT_CBS1", + "SRCU_SIZE_WAIT_CBS2", + 
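srcu_invoke_callbacks() follows the usual RCU callback-invocation shape: detach the ready callbacks under the lock, invoke them with the lock dropped, then retake the lock and subtract the invoked count (the rcu_segcblist_add_len(-len) above). The same shape on a plain list, as a hedged sketch with invented names rather than the real segcblist API:

#include <linux/list.h>
#include <linux/spinlock.h>

struct cb {
	struct list_head entry;
	void (*func)(struct cb *);
};

static DEFINE_SPINLOCK(cb_lock);
static LIST_HEAD(cb_list);
static long cb_count;

static void invoke_ready_cbs(void)
{
	LIST_HEAD(ready);
	struct cb *c, *tmp;
	long n = 0;

	spin_lock_irq(&cb_lock);
	list_splice_init(&cb_list, &ready);	/* detach under the lock */
	spin_unlock_irq(&cb_lock);

	list_for_each_entry_safe(c, tmp, &ready, entry) {
		list_del(&c->entry);
		c->func(c);			/* invoke with the lock dropped */
		n++;
	}

	spin_lock_irq(&cb_lock);
	cb_count -= n;		/* fix up the count only after invocation */
	spin_unlock_irq(&cb_lock);
}

Deferring the count adjustment until after invocation is what keeps a concurrent barrier operation from concluding too early that the list has emptied.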
"SRCU_SIZE_WAIT_CBS3", + "SRCU_SIZE_WAIT_CBS4", + "SRCU_SIZE_BIG", + "SRCU_SIZE_???", +}; + void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) { int cpu; int idx; unsigned long s0 = 0, s1 = 0; - - idx = ssp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU g%ld per-CPU(idx=%d):", - tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *sdp; - - sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = sdp->srcu_unlock_count[!idx]; - u1 = sdp->srcu_unlock_count[idx]; - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = sdp->srcu_lock_count[!idx]; - l1 = sdp->srcu_lock_count[idx]; - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld %1p)", - cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); - s0 += c0; - s1 += c1; + int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); + int ss_state_idx = ss_state; + + idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]; + if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) + ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; + pr_alert("%s%s Tree SRCU g%ld state %d (%s)", + tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state, + srcu_size_state_name[ss_state_idx]); + if (!ssp->sda) { + // Called after cleanup_srcu_struct(), perhaps. + pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n"); + } else { + pr_cont(" per-CPU(idx=%d):", idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *sdp; + + sdp = per_cpu_ptr(ssp->sda, cpu); + u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks)); + u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks)); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks)); + l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks)); + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld %c)", + cpu, c0, c1, + "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); } - pr_cont(" T(%ld,%ld)\n", s0, s1); + if (SRCU_SIZING_IS_TORTURE()) + srcu_transition_to_big(ssp); } EXPORT_SYMBOL_GPL(srcu_torture_stats_print); @@ -1317,20 +2092,117 @@ static int __init srcu_bootup_announce(void) pr_info("Hierarchical SRCU implementation.\n"); if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); + if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY) + pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay); + if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY) + pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay); + pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase); return 0; } early_initcall(srcu_bootup_announce); void __init srcu_init(void) { - struct srcu_struct *ssp; + struct srcu_usage *sup; + + /* Decide on srcu_struct-size strategy. */ + if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { + if (nr_cpu_ids >= big_cpu_lim) { + convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention. 
+ pr_info("%s: Setting srcu_struct sizes to big.\n", __func__); + } else { + convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND; + pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__); + } + } + /* + * Once that is set, call_srcu() can follow the normal path and + * queue delayed work. This must follow RCU workqueues creation + * and timers initialization. + */ srcu_init_done = true; while (!list_empty(&srcu_boot_list)) { - ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, + sup = list_first_entry(&srcu_boot_list, struct srcu_usage, work.work.entry); - check_init_srcu_struct(ssp); - list_del_init(&ssp->work.work.entry); - queue_work(rcu_gp_wq, &ssp->work.work); + list_del_init(&sup->work.work.entry); + if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && + sup->srcu_size_state == SRCU_SIZE_SMALL) + sup->srcu_size_state = SRCU_SIZE_ALLOC; + queue_work(rcu_gp_wq, &sup->work.work); } } + +#ifdef CONFIG_MODULES + +/* Initialize any global-scope srcu_struct structures used by this module. */ +static int srcu_module_coming(struct module *mod) +{ + int i; + struct srcu_struct *ssp; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + + for (i = 0; i < mod->num_srcu_structs; i++) { + ssp = *(sspp++); + ssp->sda = alloc_percpu(struct srcu_data); + if (WARN_ON_ONCE(!ssp->sda)) + return -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } + return 0; +} + +/* Clean up any global-scope srcu_struct structures used by this module. */ +static void srcu_module_going(struct module *mod) +{ + int i; + struct srcu_struct *ssp; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + + for (i = 0; i < mod->num_srcu_structs; i++) { + ssp = *(sspp++); + if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) && + !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static)) + cleanup_srcu_struct(ssp); + if (!WARN_ON(srcu_readers_active(ssp))) + free_percpu(ssp->sda); + } +} + +/* Handle one module, either coming or going. */ +static int srcu_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + ret = srcu_module_coming(mod); + break; + case MODULE_STATE_GOING: + srcu_module_going(mod); + break; + default: + break; + } + return ret; +} + +static struct notifier_block srcu_module_nb = { + .notifier_call = srcu_module_notify, + .priority = 0, +}; + +static __init int init_srcu_module_notifier(void) +{ + int ret; + + ret = register_module_notifier(&srcu_module_nb); + if (ret) + pr_warn("Failed to register srcu module notifier\n"); + return ret; +} +late_initcall(init_srcu_module_notifier); + +#endif /* #ifdef CONFIG_MODULES */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be10036fa621..da60a9947c00 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU-based infrastructure for lightweight reader-writer locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (c) 2015, Red Hat, Inc. * * Author: Oleg Nesterov <oleg@redhat.com> @@ -23,132 +10,39 @@ #include <linux/rcu_sync.h> #include <linux/sched.h> -#ifdef CONFIG_PROVE_RCU -#define __INIT_HELD(func) .held = func, -#else -#define __INIT_HELD(func) -#endif - -static const struct { - void (*sync)(void); - void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); - void (*wait)(void); -#ifdef CONFIG_PROVE_RCU - int (*held)(void); -#endif -} gp_ops[] = { - [RCU_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_held) - }, - [RCU_SCHED_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_sched_held) - }, - [RCU_BH_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_bh_held) - }, -}; - -enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; -enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; +enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY }; #define rss_lock gp_wait.lock -#ifdef CONFIG_PROVE_RCU -void rcu_sync_lockdep_assert(struct rcu_sync *rsp) -{ - RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), - "suspicious rcu_sync_is_idle() usage"); -} - -EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); -#endif - /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized - * @type: Flavor of RCU with which to synchronize rcu_sync structure */ -void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +void rcu_sync_init(struct rcu_sync *rsp) { memset(rsp, 0, sizeof(*rsp)); init_waitqueue_head(&rsp->gp_wait); - rsp->gp_type = type; } -/** - * rcu_sync_enter_start - Force readers onto slow path for multiple updates - * @rsp: Pointer to rcu_sync structure to use for synchronization - * - * Must be called after rcu_sync_init() and before first use. - * - * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() - * pairs turn into NO-OPs. - */ -void rcu_sync_enter_start(struct rcu_sync *rsp) -{ - rsp->gp_count++; - rsp->gp_state = GP_PASSED; -} +static void rcu_sync_func(struct rcu_head *rhp); -/** - * rcu_sync_enter() - Force readers onto slowpath - * @rsp: Pointer to rcu_sync structure to use for synchronization - * - * This function is used by updaters who need readers to make use of - * a slowpath during the update. After this function returns, all - * subsequent calls to rcu_sync_is_idle() will return false, which - * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. - * - * When called in isolation, rcu_sync_enter() must wait for a grace - * period, however, closely spaced calls to rcu_sync_enter() can - * optimize away the grace-period wait via a state machine implemented - * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). 
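For orientation, the whole point of rcu_sync is the reader-side fast path that rcu_sync_is_idle() enables; percpu_rw_semaphore is the main in-tree user of the pattern. A sketch of the reader side, with struct my_lock invented for the example:

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcu_sync.h>
#include <linux/rcupdate.h>

struct my_lock {
	struct rcu_sync rss;		/* rcu_sync_init(&l->rss) at setup */
	int __percpu *fast_ctr;
	atomic_t slow_ctr;
};

static void my_read_lock(struct my_lock *l)
{
	rcu_read_lock();
	if (likely(rcu_sync_is_idle(&l->rss)))
		this_cpu_inc(*l->fast_ctr);	/* no writer: cheap per-CPU add */
	else
		atomic_inc(&l->slow_ctr);	/* writer active: shared counter */
	rcu_read_unlock();
}

The rcu_read_lock() is what lets a writer use a grace period to flush out readers that sampled rcu_sync_is_idle() just before the state flipped.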
- */
-void rcu_sync_enter(struct rcu_sync *rsp)
+static void rcu_sync_call(struct rcu_sync *rsp)
 {
-	bool need_wait, need_sync;
-
-	spin_lock_irq(&rsp->rss_lock);
-	need_wait = rsp->gp_count++;
-	need_sync = rsp->gp_state == GP_IDLE;
-	if (need_sync)
-		rsp->gp_state = GP_PENDING;
-	spin_unlock_irq(&rsp->rss_lock);
-
-	WARN_ON_ONCE(need_wait && need_sync);
-	if (need_sync) {
-		gp_ops[rsp->gp_type].sync();
-		rsp->gp_state = GP_PASSED;
-		wake_up_all(&rsp->gp_wait);
-	} else if (need_wait) {
-		wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
-	} else {
-		/*
-		 * Possible when there's a pending CB from a rcu_sync_exit().
-		 * Nobody has yet been allowed the 'fast' path and thus we can
-		 * avoid doing any sync(). The callback will get 'dropped'.
-		 */
-		WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
-	}
+	call_rcu_hurry(&rsp->cb_head, rcu_sync_func);
 }
 
 /**
  * rcu_sync_func() - Callback function managing reader access to fastpath
  * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
  *
- * This function is passed to one of the call_rcu() functions by
+ * This function is passed to call_rcu() by rcu_sync_enter() and
  * rcu_sync_exit(), so that it is invoked after a grace period following the
- * that invocation of rcu_sync_exit(). It takes action based on events that
+ * invocation of enter/exit.
+ *
+ * If it is called by rcu_sync_enter() it signals that all the readers were
+ * switched onto the slow path.
+ *
+ * If it is called by rcu_sync_exit() it takes action based on events that
  * have taken place in the meantime, so that closely spaced rcu_sync_enter()
  * and rcu_sync_exit() pairs need not wait for a grace period.
  *
@@ -165,35 +59,88 @@ static void rcu_sync_func(struct rcu_head *rhp)
 	struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
 	unsigned long flags;
 
-	WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
-	WARN_ON_ONCE(rsp->cb_state == CB_IDLE);
+	WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
+	WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
 
 	spin_lock_irqsave(&rsp->rss_lock, flags);
 	if (rsp->gp_count) {
 		/*
-		 * A new rcu_sync_begin() has happened; drop the callback.
+		 * We're at least a GP after the GP_IDLE->GP_ENTER transition.
 		 */
-		rsp->cb_state = CB_IDLE;
-	} else if (rsp->cb_state == CB_REPLAY) {
+		WRITE_ONCE(rsp->gp_state, GP_PASSED);
+		wake_up_locked(&rsp->gp_wait);
+	} else if (rsp->gp_state == GP_REPLAY) {
 		/*
-		 * A new rcu_sync_exit() has happened; requeue the callback
-		 * to catch a later GP.
+		 * A new rcu_sync_exit() has happened; requeue the callback to
+		 * catch a later GP.
 		 */
-		rsp->cb_state = CB_PENDING;
-		gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
+		WRITE_ONCE(rsp->gp_state, GP_EXIT);
+		rcu_sync_call(rsp);
 	} else {
 		/*
-		 * We're at least a GP after rcu_sync_exit(); eveybody will now
-		 * have observed the write side critical section. Let 'em rip!.
+		 * We're at least a GP after the last rcu_sync_exit(); everybody
+		 * will now have observed the write side critical section.
+		 * Let 'em rip!
		 */
-		rsp->cb_state = CB_IDLE;
-		rsp->gp_state = GP_IDLE;
+		WRITE_ONCE(rsp->gp_state, GP_IDLE);
 	}
 	spin_unlock_irqrestore(&rsp->rss_lock, flags);
 }
 
 /**
- * rcu_sync_exit() - Allow readers back onto fast patch after grace period
+ * rcu_sync_enter() - Force readers onto slowpath
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who need readers to make use of
+ * a slowpath during the update.
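Continuing the invented struct my_lock from the sketch above, the matching writer side simply brackets the update with enter/exit:

static void my_update(struct my_lock *l)
{
	rcu_sync_enter(&l->rss);	/* first call from idle waits a full GP;
					 * all readers now take the slow path */
	/* ... perform the update, e.g. sum and quiesce the counters ... */
	rcu_sync_exit(&l->rss);		/* readers regain the fast path after
					 * a trailing grace period */
}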
After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader fastpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). + */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + int gp_state; + + spin_lock_irq(&rsp->rss_lock); + gp_state = rsp->gp_state; + if (gp_state == GP_IDLE) { + WRITE_ONCE(rsp->gp_state, GP_ENTER); + WARN_ON_ONCE(rsp->gp_count); + /* + * Note that we could simply do rcu_sync_call(rsp) here and + * avoid the "if (gp_state == GP_IDLE)" block below. + * + * However, synchronize_rcu() can be faster if rcu_expedited + * or rcu_blocking_is_gp() is true. + * + * Another reason is that we can't wait for rcu callback if + * we are called at early boot time but this shouldn't happen. + */ + } + rsp->gp_count++; + spin_unlock_irq(&rsp->rss_lock); + + if (gp_state == GP_IDLE) { + /* + * See the comment above, this simply does the "synchronous" + * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED. + */ + synchronize_rcu(); + rcu_sync_func(&rsp->cb_head); + /* Not really needed, wait_event() would see GP_PASSED. */ + return; + } + + wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast path after grace period * @rsp: Pointer to rcu_sync structure to use for synchronization * * This function is used by updaters who have completed, and can therefore @@ -204,13 +151,16 @@ static void rcu_sync_func(struct rcu_head *rhp) */ void rcu_sync_exit(struct rcu_sync *rsp) { + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + spin_lock_irq(&rsp->rss_lock); + WARN_ON_ONCE(rsp->gp_count == 0); if (!--rsp->gp_count) { - if (rsp->cb_state == CB_IDLE) { - rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); - } else if (rsp->cb_state == CB_PENDING) { - rsp->cb_state = CB_REPLAY; + if (rsp->gp_state == GP_PASSED) { + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); + } else if (rsp->gp_state == GP_EXIT) { + WRITE_ONCE(rsp->gp_state, GP_REPLAY); } } spin_unlock_irq(&rsp->rss_lock); @@ -222,18 +172,19 @@ void rcu_sync_exit(struct rcu_sync *rsp) */ void rcu_sync_dtor(struct rcu_sync *rsp) { - int cb_state; + int gp_state; - WARN_ON_ONCE(rsp->gp_count); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); - if (rsp->cb_state == CB_REPLAY) - rsp->cb_state = CB_PENDING; - cb_state = rsp->cb_state; + WARN_ON_ONCE(rsp->gp_count); + if (rsp->gp_state == GP_REPLAY) + WRITE_ONCE(rsp->gp_state, GP_EXIT); + gp_state = rsp->gp_state; spin_unlock_irq(&rsp->rss_lock); - if (cb_state != CB_IDLE) { - gp_ops[rsp->gp_type].wait(); - WARN_ON_ONCE(rsp->cb_state != CB_IDLE); + if (gp_state != GP_IDLE) { + rcu_barrier(); + WARN_ON_ONCE(rsp->gp_state != GP_IDLE); } } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h new file mode 100644 index 000000000000..2dc044fd126e --- /dev/null +++ b/kernel/rcu/tasks.h @@ -0,0 +1,2283 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Task-based RCU implementations. + * + * Copyright (C) 2020 Paul E. 
McKenney + */ + +#ifdef CONFIG_TASKS_RCU_GENERIC +#include "rcu_segcblist.h" + +//////////////////////////////////////////////////////////////////////// +// +// Generic data structures. + +struct rcu_tasks; +typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); +typedef void (*pregp_func_t)(struct list_head *hop); +typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); +typedef void (*postscan_func_t)(struct list_head *hop); +typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); +typedef void (*postgp_func_t)(struct rcu_tasks *rtp); + +/** + * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism. + * @cblist: Callback list. + * @lock: Lock protecting per-CPU callback list. + * @rtp_jiffies: Jiffies counter value for statistics. + * @lazy_timer: Timer to unlazify callbacks. + * @urgent_gp: Number of additional non-lazy grace periods. + * @rtp_n_lock_retries: Rough lock-contention statistic. + * @rtp_work: Work queue for invoking callbacks. + * @rtp_irq_work: IRQ work queue for deferred wakeups. + * @barrier_q_head: RCU callback for barrier operation. + * @rtp_blkd_tasks: List of tasks blocked as readers. + * @rtp_exit_list: List of tasks in the latter portion of do_exit(). + * @cpu: CPU number corresponding to this entry. + * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure. + * @rtpp: Pointer to the rcu_tasks structure. + */ +struct rcu_tasks_percpu { + struct rcu_segcblist cblist; + raw_spinlock_t __private lock; + unsigned long rtp_jiffies; + unsigned long rtp_n_lock_retries; + struct timer_list lazy_timer; + unsigned int urgent_gp; + struct work_struct rtp_work; + struct irq_work rtp_irq_work; + struct rcu_head barrier_q_head; + struct list_head rtp_blkd_tasks; + struct list_head rtp_exit_list; + int cpu; + int index; + struct rcu_tasks *rtpp; +}; + +/** + * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. + * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. + * @cbs_gbl_lock: Lock protecting callback list. + * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone. + * @gp_func: This flavor's grace-period-wait function. + * @gp_state: Grace period's most recent state transition (debugging). + * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping. + * @init_fract: Initial backoff sleep interval. + * @gp_jiffies: Time of last @gp_state transition. + * @gp_start: Most recent grace-period start in jiffies. + * @tasks_gp_seq: Number of grace periods completed since boot in upper bits. + * @n_ipis: Number of IPIs sent to encourage grace periods to end. + * @n_ipis_fails: Number of IPI-send failures. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy. + * @pregp_func: This flavor's pre-grace-period function (optional). + * @pertask_func: This flavor's per-task scan function (optional). + * @postscan_func: This flavor's post-task scan function (optional). + * @holdouts_func: This flavor's holdout-list scan function (optional). + * @postgp_func: This flavor's post-grace-period function (optional). + * @call_func: This flavor's call_rcu()-equivalent function. + * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE). + * @rtpcpu: This flavor's rcu_tasks_percpu structure. + * @rtpcp_array: Array of pointers to rcu_tasks_percpu structure of CPUs in cpu_possible_mask. 
+ * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. + * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. + * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. + * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers. + * @barrier_q_mutex: Serialize barrier operations. + * @barrier_q_count: Number of queues being waited on. + * @barrier_q_completion: Barrier wait/wakeup mechanism. + * @barrier_q_seq: Sequence number for barrier operations. + * @barrier_q_start: Most recent barrier start in jiffies. + * @name: This flavor's textual name. + * @kname: This flavor's kthread name. + */ +struct rcu_tasks { + struct rcuwait cbs_wait; + raw_spinlock_t cbs_gbl_lock; + struct mutex tasks_gp_mutex; + int gp_state; + int gp_sleep; + int init_fract; + unsigned long gp_jiffies; + unsigned long gp_start; + unsigned long tasks_gp_seq; + unsigned long n_ipis; + unsigned long n_ipis_fails; + struct task_struct *kthread_ptr; + unsigned long lazy_jiffies; + rcu_tasks_gp_func_t gp_func; + pregp_func_t pregp_func; + pertask_func_t pertask_func; + postscan_func_t postscan_func; + holdouts_func_t holdouts_func; + postgp_func_t postgp_func; + call_rcu_func_t call_func; + unsigned int wait_state; + struct rcu_tasks_percpu __percpu *rtpcpu; + struct rcu_tasks_percpu **rtpcp_array; + int percpu_enqueue_shift; + int percpu_enqueue_lim; + int percpu_dequeue_lim; + unsigned long percpu_dequeue_gpseq; + struct mutex barrier_q_mutex; + atomic_t barrier_q_count; + struct completion barrier_q_completion; + unsigned long barrier_q_seq; + unsigned long barrier_q_start; + char *name; + char *kname; +}; + +static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); + +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ + .rtp_irq_work = IRQ_WORK_INIT_HARD(call_rcu_tasks_iw_wakeup), \ +}; \ +static struct rcu_tasks rt_name = \ +{ \ + .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \ + .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ + .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \ + .gp_func = gp, \ + .call_func = call, \ + .wait_state = TASK_UNINTERRUPTIBLE, \ + .rtpcpu = &rt_name ## __percpu, \ + .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \ + .name = n, \ + .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \ + .percpu_enqueue_lim = 1, \ + .percpu_dequeue_lim = 1, \ + .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ + .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \ + .kname = #rt_name, \ +} + +#ifdef CONFIG_TASKS_RCU + +/* Report delay of scan exiting tasklist in rcu_tasks_postscan(). */ +static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); +static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); +#endif + +/* Avoid IPIing CPUs early in the grace period. */ +#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0) +static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; +module_param(rcu_task_ipi_delay, int, 0644); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. 
*/ +#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30) +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; +module_param(rcu_task_stall_timeout, int, 0644); +#define RCU_TASK_STALL_INFO (HZ * 10) +static int rcu_task_stall_info __read_mostly = RCU_TASK_STALL_INFO; +module_param(rcu_task_stall_info, int, 0644); +static int rcu_task_stall_info_mult __read_mostly = 3; +module_param(rcu_task_stall_info_mult, int, 0444); + +static int rcu_task_enqueue_lim __read_mostly = -1; +module_param(rcu_task_enqueue_lim, int, 0444); + +static bool rcu_task_cb_adjust; +static int rcu_task_contend_lim __read_mostly = 100; +module_param(rcu_task_contend_lim, int, 0444); +static int rcu_task_collapse_lim __read_mostly = 10; +module_param(rcu_task_collapse_lim, int, 0444); +static int rcu_task_lazy_lim __read_mostly = 32; +module_param(rcu_task_lazy_lim, int, 0444); + +static int rcu_task_cpu_ids; + +/* RCU tasks grace-period state for debugging. */ +#define RTGS_INIT 0 +#define RTGS_WAIT_WAIT_CBS 1 +#define RTGS_WAIT_GP 2 +#define RTGS_PRE_WAIT_GP 3 +#define RTGS_SCAN_TASKLIST 4 +#define RTGS_POST_SCAN_TASKLIST 5 +#define RTGS_WAIT_SCAN_HOLDOUTS 6 +#define RTGS_SCAN_HOLDOUTS 7 +#define RTGS_POST_GP 8 +#define RTGS_WAIT_READERS 9 +#define RTGS_INVOKE_CBS 10 +#define RTGS_WAIT_CBS 11 +#ifndef CONFIG_TINY_RCU +static const char * const rcu_tasks_gp_state_names[] = { + "RTGS_INIT", + "RTGS_WAIT_WAIT_CBS", + "RTGS_WAIT_GP", + "RTGS_PRE_WAIT_GP", + "RTGS_SCAN_TASKLIST", + "RTGS_POST_SCAN_TASKLIST", + "RTGS_WAIT_SCAN_HOLDOUTS", + "RTGS_SCAN_HOLDOUTS", + "RTGS_POST_GP", + "RTGS_WAIT_READERS", + "RTGS_INVOKE_CBS", + "RTGS_WAIT_CBS", +}; +#endif /* #ifndef CONFIG_TINY_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Generic code. + +static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp); + +/* Record grace-period phase and time. */ +static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) +{ + rtp->gp_state = newstate; + rtp->gp_jiffies = jiffies; +} + +#ifndef CONFIG_TINY_RCU +/* Return state name. */ +static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) +{ + int i = data_race(rtp->gp_state); // Let KCSAN detect update races + int j = READ_ONCE(i); // Prevent the compiler from reading twice + + if (j >= ARRAY_SIZE(rcu_tasks_gp_state_names)) + return "???"; + return rcu_tasks_gp_state_names[j]; +} +#endif /* #ifndef CONFIG_TINY_RCU */ + +// Initialize per-CPU callback lists for the specified flavor of +// Tasks RCU. Do not enqueue callbacks before this function is invoked. 
+static void cblist_init_generic(struct rcu_tasks *rtp) +{ + int cpu; + int lim; + int shift; + int maxcpu; + int index = 0; + + if (rcu_task_enqueue_lim < 0) { + rcu_task_enqueue_lim = 1; + rcu_task_cb_adjust = true; + } else if (rcu_task_enqueue_lim == 0) { + rcu_task_enqueue_lim = 1; + } + lim = rcu_task_enqueue_lim; + + rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL); + BUG_ON(!rtp->rtpcp_array); + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + WARN_ON_ONCE(!rtpcp); + if (cpu) + raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); + if (rcu_segcblist_empty(&rtpcp->cblist)) + rcu_segcblist_init(&rtpcp->cblist); + INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); + rtpcp->cpu = cpu; + rtpcp->rtpp = rtp; + rtpcp->index = index; + rtp->rtpcp_array[index] = rtpcp; + index++; + if (!rtpcp->rtp_blkd_tasks.next) + INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head; + maxcpu = cpu; + } + + rcu_task_cpu_ids = maxcpu + 1; + if (lim > rcu_task_cpu_ids) + lim = rcu_task_cpu_ids; + shift = ilog2(rcu_task_cpu_ids / lim); + if (((rcu_task_cpu_ids - 1) >> shift) >= lim) + shift++; + WRITE_ONCE(rtp->percpu_enqueue_shift, shift); + WRITE_ONCE(rtp->percpu_dequeue_lim, lim); + smp_store_release(&rtp->percpu_enqueue_lim, lim); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n", + rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), + rcu_task_cb_adjust, rcu_task_cpu_ids); +} + +// Compute wakeup time for lazy callback timer. +static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp) +{ + return jiffies + rtp->lazy_jiffies; +} + +// Timer handler that unlazifies lazy callbacks. +static void call_rcu_tasks_generic_timer(struct timer_list *tlp) +{ + unsigned long flags; + bool needwake = false; + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = timer_container_of(rtpcp, tlp, + lazy_timer); + + rtp = rtpcp->rtpp; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) { + if (!rtpcp->urgent_gp) + rtpcp->urgent_gp = 1; + needwake = true; + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + } + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (needwake) + rcuwait_wake_up(&rtp->cbs_wait); +} + +// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). +static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); + + rtp = rtpcp->rtpp; + rcuwait_wake_up(&rtp->cbs_wait); +} + +// Enqueue a callback for the specified flavor of Tasks RCU. 
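The shift computation in cblist_init_generic() maps CPU numbers onto a smaller number of callback queues via cpu >> shift, bumping the shift when the highest CPU would otherwise map past the last queue. A self-contained user-space check of that arithmetic (ilog2_() is a stand-in for the kernel's ilog2()):

#include <assert.h>

/* Tiny stand-in for the kernel's ilog2(): floor(log2(v)), v > 0. */
static int ilog2_(unsigned int v)
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	int ncpus = 6, lim = 4;		/* e.g. 6 possible CPUs, 4 queues */
	int shift = ilog2_(ncpus / lim);

	if (((ncpus - 1) >> shift) >= lim)
		shift++;		/* highest CPU must still map below lim */

	for (int cpu = 0; cpu < ncpus; cpu++)
		assert((cpu >> shift) < lim);	/* every CPU lands on a queue */
	return 0;
}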
+static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, + struct rcu_tasks *rtp) +{ + int chosen_cpu; + unsigned long flags; + bool havekthread = smp_load_acquire(&rtp->kthread_ptr); + int ideal_cpu; + unsigned long j; + bool needadjust = false; + bool needwake; + struct rcu_tasks_percpu *rtpcp; + + rhp->next = NULL; + rhp->func = func; + local_irq_save(flags); + rcu_read_lock(); + ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); + chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids); + rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); + if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + j = jiffies; + if (rtpcp->rtp_jiffies != j) { + rtpcp->rtp_jiffies = j; + rtpcp->rtp_n_lock_retries = 0; + } + if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && + READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids) + needadjust = true; // Defer adjustment to avoid deadlock. + } + // Queuing callbacks before initialization not yet supported. + if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist))) + rcu_segcblist_init(&rtpcp->cblist); + needwake = (func == wakeme_after_rcu) || + (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim); + if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) { + if (rtp->lazy_jiffies) + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + else + needwake = rcu_segcblist_empty(&rtpcp->cblist); + } + if (needwake) + rtpcp->urgent_gp = 3; + rcu_segcblist_enqueue(&rtpcp->cblist, rhp); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (unlikely(needadjust)) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) { + WRITE_ONCE(rtp->percpu_enqueue_shift, 0); + WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids); + smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids); + pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + rcu_read_unlock(); + /* We can't create the thread unless interrupts are enabled. */ + if (needwake && READ_ONCE(rtp->kthread_ptr)) + irq_work_queue(&rtpcp->rtp_irq_work); +} + +// RCU callback function for rcu_barrier_tasks_generic(). +static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp; + + rhp->next = rhp; // Mark the callback as having been invoked. + rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head); + rtp = rtpcp->rtpp; + if (atomic_dec_and_test(&rtp->barrier_q_count)) + complete(&rtp->barrier_q_completion); +} + +// Wait for all in-flight callbacks for the specified RCU Tasks flavor. +// Operates in a manner similar to rcu_barrier(). 
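The enqueue path above measures contention by counting, per jiffy, how often the trylock fails and the fallback full lock must be taken. That heuristic in isolation, as a hedged sketch with invented names (the real code uses the raw_spin_*_rcu_node() wrappers and runs with interrupts disabled):

#include <linux/jiffies.h>
#include <linux/spinlock.h>

/*
 * Acquire @lock, and return true when it saw more than @lim contended
 * acquisitions during the current jiffy. Caller unlocks as usual.
 */
static bool enqueue_contended(raw_spinlock_t *lock, unsigned long *last_j,
			      unsigned long *nretries, int lim)
{
	if (raw_spin_trylock(lock))
		return false;			/* uncontended fast path */
	raw_spin_lock(lock);			/* contended: wait, then count it */
	if (*last_j != jiffies) {
		*last_j = jiffies;		/* new jiffy: restart the count */
		*nretries = 0;
	}
	return ++*nretries > lim;		/* too much contention this jiffy? */
}

On a true return, the caller widens the queue fan-out, just as the code above switches to per-CPU callback queuing.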
+static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq); + + mutex_lock(&rtp->barrier_q_mutex); + if (rcu_seq_done(&rtp->barrier_q_seq, s)) { + smp_mb(); + mutex_unlock(&rtp->barrier_q_mutex); + return; + } + rtp->barrier_q_start = jiffies; + rcu_seq_start(&rtp->barrier_q_seq); + init_completion(&rtp->barrier_q_completion); + atomic_set(&rtp->barrier_q_count, 2); + for_each_possible_cpu(cpu) { + if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim)) + break; + rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head)) + atomic_inc(&rtp->barrier_q_count); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } + if (atomic_sub_and_test(2, &rtp->barrier_q_count)) + complete(&rtp->barrier_q_completion); + wait_for_completion(&rtp->barrier_q_completion); + rcu_seq_end(&rtp->barrier_q_seq); + mutex_unlock(&rtp->barrier_q_mutex); +} + +// Advance callbacks and indicate whether either a grace period or +// callback invocation is needed. +static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) +{ + int cpu; + int dequeue_limit; + unsigned long flags; + bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq); + long n; + long ncbs = 0; + long ncbsnz = 0; + int needgpcb = 0; + + dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); + for (cpu = 0; cpu < dequeue_limit; cpu++) { + if (!cpu_possible(cpu)) + continue; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + /* Advance and accelerate any new callbacks. */ + if (!rcu_segcblist_n_cbs(&rtpcp->cblist)) + continue; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + // Should we shrink down to a single callback queue? + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (n) { + ncbs += n; + if (cpu > 0) + ncbsnz += n; + } + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) { + if (rtp->lazy_jiffies) + rtpcp->urgent_gp--; + needgpcb |= 0x3; + } else if (rcu_segcblist_empty(&rtpcp->cblist)) { + rtpcp->urgent_gp = 0; + } + if (rcu_segcblist_ready_cbs(&rtpcp->cblist)) + needgpcb |= 0x1; + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } + + // Shrink down to a single callback queue if appropriate. + // This is done in two stages: (1) If there are no more than + // rcu_task_collapse_lim callbacks on CPU 0 and none on any other + // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period, + // if there has not been an increase in callbacks, limit dequeuing + // to CPU 0. Note the matching RCU read-side critical section in + // call_rcu_tasks_generic(). 
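rcu_barrier_tasks_generic() opens with the same snapshot/done idiom as srcu_barrier() earlier in this patch: snapshot the sequence before taking the mutex, and if the sequence has advanced past the snapshot by the time the mutex is held, a concurrent caller already did the work. The idiom in isolation (my_barrier() is invented; the rcu_seq_*() helpers are the kernel/rcu/rcu.h ones used above):

#include <linux/mutex.h>

static void my_barrier(struct mutex *m, unsigned long *seq)
{
	unsigned long s = rcu_seq_snap(seq);	/* value at which our work is done */

	mutex_lock(m);
	if (rcu_seq_done(seq, s)) {
		mutex_unlock(m);		/* a concurrent caller did our work */
		return;
	}
	rcu_seq_start(seq);
	/* ... entrain per-CPU callbacks and wait for them all ... */
	rcu_seq_end(seq);
	mutex_unlock(m);
}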
+ if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim > 1) { + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids)); + smp_store_release(&rtp->percpu_enqueue_lim, 1); + rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); + gpdone = false; + pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + if (rcu_task_cb_adjust && !ncbsnz && gpdone) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) { + WRITE_ONCE(rtp->percpu_dequeue_lim, 1); + pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); + } + if (rtp->percpu_dequeue_lim == 1) { + for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); + } + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + + return needgpcb; +} + +// Advance callbacks and invoke any that are ready. +static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp) +{ + int cpuwq; + unsigned long flags; + int len; + int index; + struct rcu_head *rhp; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + struct rcu_tasks_percpu *rtpcp_next; + + index = rtpcp->index * 2 + 1; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work); + index++; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_percpu_wq, &rtpcp_next->rtp_work); + } + } + } + } + + if (rcu_segcblist_empty(&rtpcp->cblist)) + return; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + len = rcl.len; + for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_callback(rhp); + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + cond_resched(); + } + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_add_len(&rtpcp->cblist, -len); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); +} + +// Workqueue flood to advance callbacks and invoke any that are ready. +static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work); + + rtp = rtpcp->rtpp; + rcu_tasks_invoke_cbs(rtp, rtpcp); +} + +// Wait for one grace period. +static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) +{ + int needgpcb; + + mutex_lock(&rtp->tasks_gp_mutex); + + // If there were none, wait a bit and start over. 
+ if (unlikely(midboot)) { + needgpcb = 0x2; + } else { + mutex_unlock(&rtp->tasks_gp_mutex); + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); + mutex_lock(&rtp->tasks_gp_mutex); + } + + if (needgpcb & 0x2) { + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rcu_seq_start(&rtp->tasks_gp_seq); + rtp->gp_func(rtp); + rcu_seq_end(&rtp->tasks_gp_seq); + } + + // Invoke callbacks. + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); + rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); + mutex_unlock(&rtp->tasks_gp_mutex); +} + +// RCU-tasks kthread that detects grace periods and invokes callbacks. +static int __noreturn rcu_tasks_kthread(void *arg) +{ + int cpu; + struct rcu_tasks *rtp = arg; + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0); + rtpcp->urgent_gp = 1; + } + + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ + housekeeping_affine(current, HK_TYPE_RCU); + smp_store_release(&rtp->kthread_ptr, current); // Let GPs start! + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + // Wait for one grace period and invoke any callbacks + // that are ready. + rcu_tasks_one_gp(rtp, false); + + // Paranoid sleep to keep this from entering a tight loop. + schedule_timeout_idle(rtp->gp_sleep); + } +} + +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) +{ + /* Complain if the scheduler has not started. */ + if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_%s() called too soon", rtp->name)) + return; + + // If the grace-period kthread is running, use it. + if (READ_ONCE(rtp->kthread_ptr)) { + wait_rcu_gp_state(rtp->wait_state, rtp->call_func); + return; + } + rcu_tasks_one_gp(rtp, true); +} + +/* Spawn RCU-tasks grace-period kthread. */ +static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) +{ + struct task_struct *t; + + t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname); + if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name)) + return; + smp_mb(); /* Ensure others see full kthread. */ +} + +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. 
+ */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + int rtsimc; + + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + rtsimc = clamp(rcu_task_stall_info_mult, 1, 10); + if (rtsimc != rcu_task_stall_info_mult) { + pr_info("\tTasks-RCU CPU stall info multiplier clamped to %d (rcu_task_stall_info_mult).\n", rtsimc); + rcu_task_stall_info_mult = rtsimc; + } +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RCU + pr_info("\tTrampoline variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("\tRude variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("\tTracing variant of Tasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ +} + + +/* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ +static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) +{ + int cpu; + bool havecbs = false; + bool haveurgent = false; + bool haveurgentcbs = false; + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) + havecbs = true; + if (data_race(rtpcp->urgent_gp)) + haveurgent = true; + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp)) + haveurgentcbs = true; + if (havecbs && haveurgent && haveurgentcbs) + break; + } + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n", + rtp->kname, + tasks_gp_state_getname(rtp), data_race(rtp->gp_state), + jiffies - data_race(rtp->gp_jiffies), + data_race(rcu_seq_current(&rtp->tasks_gp_seq)), + data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), + ".k"[!!data_race(rtp->kthread_ptr)], + ".C"[havecbs], + ".u"[haveurgent], + ".U"[haveurgentcbs], + rtp->lazy_jiffies, + s); +} + +/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. 
*/ +static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt, + char *tf, char *tst) +{ + cpumask_var_t cm; + int cpu; + bool gotcb = false; + unsigned long j = jiffies; + + pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n", + tt, tf, tst, data_race(rtp->tasks_gp_seq), + j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies), + data_race(rtp->gp_state), tasks_gp_state_getname(rtp)); + pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n", + data_race(rtp->percpu_enqueue_shift), + data_race(rtp->percpu_enqueue_lim), + data_race(rtp->percpu_dequeue_lim), + data_race(rtp->percpu_dequeue_gpseq)); + (void)zalloc_cpumask_var(&cm, GFP_KERNEL); + pr_alert("\tCallback counts:"); + for_each_possible_cpu(cpu) { + long n; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head)) + cpumask_set_cpu(cpu, cm); + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (!n) + continue; + pr_cont(" %d:%ld", cpu, n); + gotcb = true; + } + if (gotcb) + pr_cont(".\n"); + else + pr_cont(" (none).\n"); + pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ", + data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start), + atomic_read(&rtp->barrier_q_count)); + if (cpumask_available(cm) && !cpumask_empty(cm)) + pr_cont(" %*pbl.\n", cpumask_pr_args(cm)); + else + pr_cont("(none).\n"); + free_cpumask_var(cm); +} + +#endif // #ifndef CONFIG_TINY_RCU + +static void exit_tasks_rcu_finish_trace(struct task_struct *t); + +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + +//////////////////////////////////////////////////////////////////////// +// +// Shared code between task-list-scanning variants of Tasks RCU. + +/* Wait for one RCU-tasks grace period. */ +static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) +{ + struct task_struct *g; + int fract; + LIST_HEAD(holdouts); + unsigned long j; + unsigned long lastinfo; + unsigned long lastreport; + bool reported = false; + int rtsi; + struct task_struct *t; + + set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); + rtp->pregp_func(&holdouts); + + /* + * There were callbacks, so we need to wait for an RCU-tasks + * grace period. Start off by scanning the task list for tasks + * that are not already voluntarily blocked. Mark these tasks + * and make a list of them in holdouts. + */ + set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); + if (rtp->pertask_func) { + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + } + + set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); + rtp->postscan_func(&holdouts); + + /* + * Each pass through the following loop scans the list of holdout + * tasks, removing any that are no longer holdouts. When the list + * is empty, we are done. + */ + lastreport = jiffies; + lastinfo = lastreport; + rtsi = READ_ONCE(rcu_task_stall_info); + + // Start off with initial wait and slowly back off to 1 HZ wait. 
+ fract = rtp->init_fract; + + while (!list_empty(&holdouts)) { + ktime_t exp; + bool firstreport; + bool needreport; + int rtst; + + // Slowly back off waiting for holdouts + set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + schedule_timeout_idle(fract); + } else { + exp = jiffies_to_nsecs(fract); + __set_current_state(TASK_IDLE); + schedule_hrtimeout_range(&exp, jiffies_to_nsecs(HZ / 2), HRTIMER_MODE_REL_HARD); + } + + if (fract < HZ) + fract++; + + rtst = READ_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); + if (needreport) { + lastreport = jiffies; + reported = true; + } + firstreport = true; + WARN_ON(signal_pending(current)); + set_tasks_gp_state(rtp, RTGS_SCAN_HOLDOUTS); + rtp->holdouts_func(&holdouts, needreport, &firstreport); + + // Print pre-stall informational messages if needed. + j = jiffies; + if (rtsi > 0 && !reported && time_after(j, lastinfo + rtsi)) { + lastinfo = j; + rtsi = rtsi * rcu_task_stall_info_mult; + pr_info("%s: %s grace period number %lu (since boot) is %lu jiffies old.\n", + __func__, rtp->kname, rtp->tasks_gp_seq, j - rtp->gp_start); + } + } + + set_tasks_gp_state(rtp, RTGS_POST_GP); + rtp->postgp_func(rtp); +} + +#endif /* #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) */ + +#ifdef CONFIG_TASKS_RCU + +//////////////////////////////////////////////////////////////////////// +// +// Simple variant of RCU whose quiescent states are voluntary context +// switch, cond_resched_tasks_rcu_qs(), user-space execution, and idle. +// As such, grace periods can take one good long time. There are no +// read-side primitives similar to rcu_read_lock() and rcu_read_unlock() +// because this implementation is intended to get the system into a safe +// state for some of the manipulations involved in tracing and the like. +// Finally, this implementation does not support high call_rcu_tasks() +// rates from multiple CPUs. If this is required, per-CPU callback lists +// will be needed. +// +// The implementation uses rcu_tasks_wait_gp(), which relies on function +// pointers in the rcu_tasks structure. The rcu_spawn_tasks_kthread() +// function sets these function pointers up so that rcu_tasks_wait_gp() +// invokes these functions in this order: +// +// rcu_tasks_pregp_step(): +// Invokes synchronize_rcu() in order to wait for all in-flight +// t->on_rq and t->nvcsw transitions to complete. This works because +// all such transitions are carried out with interrupts disabled. +// rcu_tasks_pertask(), invoked on every non-idle task: +// For every runnable non-idle task other than the current one, use +// get_task_struct() to pin down that task, snapshot that task's +// number of voluntary context switches, and add that task to the +// holdout list. +// rcu_tasks_postscan(): +// Gather per-CPU lists of tasks in do_exit() to ensure that all +// tasks that were in the process of exiting (and which thus might +// not know to synchronize with this RCU Tasks grace period) have +// completed exiting. The synchronize_rcu() in rcu_tasks_postgp() +// will take care of any tasks stuck in the non-preemptible region +// of do_exit() following its call to exit_tasks_rcu_finish(). +// check_all_holdout_tasks(), repeatedly until holdout list is empty: +// Scans the holdout list, attempting to identify a quiescent state +// for each task on the list. If there is a quiescent state, the +// corresponding task is removed from the holdout list. 
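The holdout loop above interleaves three cadences: a linearly growing poll interval capped at one second, stall reports gated by rcu_task_stall_timeout, and informational messages whose spacing grows by rcu_task_stall_info_mult each time so they become progressively quieter. A schematic of just the cadence logic, with scan_holdouts() as an invented stand-in for rtp->holdouts_func():

#include <linux/list.h>
#include <linux/sched.h>

static void scan_holdouts(struct list_head *hop, bool report);	/* invented */

static void poll_holdouts(struct list_head *holdouts, int fract)
{
	unsigned long lastreport = jiffies;
	unsigned long lastinfo = jiffies;
	int rtsi = READ_ONCE(rcu_task_stall_info);
	int rtst = READ_ONCE(rcu_task_stall_timeout);

	while (!list_empty(holdouts)) {
		schedule_timeout_idle(fract);
		if (fract < HZ)
			fract++;		/* linear backoff, capped at 1s */
		if (rtst > 0 && time_after(jiffies, lastreport + rtst)) {
			lastreport = jiffies;
			scan_holdouts(holdouts, true);	/* stall: report loudly */
		} else {
			scan_holdouts(holdouts, false);
		}
		if (rtsi > 0 && time_after(jiffies, lastinfo + rtsi)) {
			lastinfo = jiffies;
			rtsi *= rcu_task_stall_info_mult; /* progressively rarer */
			pr_info("still waiting on holdout tasks\n");
		}
	}
}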
+// rcu_tasks_postgp():
+//	Invokes synchronize_rcu() in order to ensure that all prior
+//	t->on_rq and t->nvcsw transitions are seen by all CPUs and tasks
+//	to have happened before the end of this RCU Tasks grace period.
+//	Again, this works because all such transitions are carried out
+//	with interrupts disabled.
+//
+// For each exiting task, the exit_tasks_rcu_start() and
+// exit_tasks_rcu_finish() functions add the current task to and remove
+// it from, respectively, a per-CPU list of tasks that rcu_tasks_postscan()
+// must wait on. This is necessary because rcu_tasks_postscan() must wait
+// on tasks that have already been removed from the global list of tasks.
+//
+// Pre-grace-period update-side code is ordered before the grace
+// period via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side
+// code is ordered before the grace period via the synchronize_rcu() call
+// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt
+// disabling.
+
+/* Pre-grace-period preparation. */
+static void rcu_tasks_pregp_step(struct list_head *hop)
+{
+	/*
+	 * Wait for all pre-existing t->on_rq and t->nvcsw transitions
+	 * to complete. Invoking synchronize_rcu() suffices because all
+	 * these transitions occur with interrupts disabled. Without this
+	 * synchronize_rcu(), a read-side critical section that started
+	 * before the grace period might be incorrectly seen as having
+	 * started after the grace period.
+	 *
+	 * This synchronize_rcu() also dispenses with the need for a
+	 * memory barrier on the first store to t->rcu_tasks_holdout,
+	 * as it forces the store to happen after the beginning of the
+	 * grace period.
+	 */
+	synchronize_rcu();
+}
+
+/* Check for quiescent states since the pregp's synchronize_rcu(). */
+static bool rcu_tasks_is_holdout(struct task_struct *t)
+{
+	int cpu;
+
+	/* Has the task been seen voluntarily sleeping? */
+	if (!READ_ONCE(t->on_rq))
+		return false;
+
+	/*
+	 * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but
+	 * since it is a spurious state (it will transition into the
+	 * traditional blocked state or get woken up without outside
+	 * dependencies), not considering it such should only affect timing.
+	 *
+	 * Be conservative for now and do not include it.
+	 */
+
+	/*
+	 * Idle tasks (or idle injection) within the idle loop are RCU-tasks
+	 * quiescent states. But CPU boot code performed by the idle task
+	 * isn't a quiescent state.
+	 */
+	if (is_idle_task(t))
+		return false;
+
+	cpu = task_cpu(t);
+
+	/* Idle tasks on offline CPUs are RCU-tasks quiescent states. */
+	if (t == idle_task(cpu) && !rcu_cpu_online(cpu))
+		return false;
+
+	return true;
+}
+
+/* Per-task initial processing. */
+static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
+{
+	if (t != current && rcu_tasks_is_holdout(t)) {
+		get_task_struct(t);
+		t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
+		WRITE_ONCE(t->rcu_tasks_holdout, true);
+		list_add(&t->rcu_tasks_holdout_list, hop);
+	}
+}
+
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+
+/* Processing between scanning the tasklist and draining the holdout list. */
+static void rcu_tasks_postscan(struct list_head *hop)
+{
+	int cpu;
+	int rtsi = READ_ONCE(rcu_task_stall_info);
+
+	if (!IS_ENABLED(CONFIG_TINY_RCU)) {
+		tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+		add_timer(&tasks_rcu_exit_srcu_stall_timer);
+	}
+
+	/*
+	 * Exiting tasks may escape the tasklist scan.
+	 * until their final schedule() with TASK_DEAD state.  To cope with
+	 * this, divide the fragile part of the exit path into two
+	 * intersecting read-side critical sections:
+	 *
+	 * 1) A task_struct list addition before calling exit_notify(),
+	 *    which may remove the task from the tasklist, with the
+	 *    removal after the final preempt_disable() call in do_exit().
+	 *
+	 * 2) An _RCU_ read side starting with the final preempt_disable()
+	 *    call in do_exit() and ending with the final call to schedule()
+	 *    with TASK_DEAD state.
+	 *
+	 * This handles part 1).  And postgp will handle part 2) with a
+	 * call to synchronize_rcu().
+	 */
+
+	for_each_possible_cpu(cpu) {
+		unsigned long j = jiffies + 1;
+		struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu);
+		struct task_struct *t;
+		struct task_struct *t1;
+		struct list_head tmp;
+
+		raw_spin_lock_irq_rcu_node(rtpcp);
+		list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) {
+			if (list_empty(&t->rcu_tasks_holdout_list))
+				rcu_tasks_pertask(t, hop);
+
+			// RT kernels need frequent pauses; other kernels
+			// pause at least once per pair of jiffies.
+			if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j))
+				continue;
+
+			// Keep our place in the list while pausing.
+			// Nothing else traverses this list, so adding a
+			// bare list_head is OK.
+			list_add(&tmp, &t->rcu_tasks_exit_list);
+			raw_spin_unlock_irq_rcu_node(rtpcp);
+			cond_resched(); // For CONFIG_PREEMPT=n kernels
+			raw_spin_lock_irq_rcu_node(rtpcp);
+			t1 = list_entry(tmp.next, struct task_struct, rcu_tasks_exit_list);
+			list_del(&tmp);
+			j = jiffies + 1;
+		}
+		raw_spin_unlock_irq_rcu_node(rtpcp);
+	}
+
+	if (!IS_ENABLED(CONFIG_TINY_RCU))
+		timer_delete_sync(&tasks_rcu_exit_srcu_stall_timer);
+}
+
+/* See if tasks are still holding out, complain if so. */
+static void check_holdout_task(struct task_struct *t,
+			       bool needreport, bool *firstreport)
+{
+	int cpu;
+
+	if (!READ_ONCE(t->rcu_tasks_holdout) ||
+	    t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
+	    !rcu_tasks_is_holdout(t) ||
+	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
+	     !is_idle_task(t) && READ_ONCE(t->rcu_tasks_idle_cpu) >= 0)) {
+		WRITE_ONCE(t->rcu_tasks_holdout, false);
+		list_del_init(&t->rcu_tasks_holdout_list);
+		put_task_struct(t);
+		return;
+	}
+	rcu_request_urgent_qs_task(t);
+	if (!needreport)
+		return;
+	if (*firstreport) {
+		pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
+		*firstreport = false;
+	}
+	cpu = task_cpu(t);
+	pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
+		 t, ".I"[is_idle_task(t)],
+		 "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
+		 t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
+		 data_race(t->rcu_tasks_idle_cpu), cpu);
+	sched_show_task(t);
+}
+
+/* Scan the holdout lists for tasks no longer holding out. */
+static void check_all_holdout_tasks(struct list_head *hop,
+				    bool needreport, bool *firstreport)
+{
+	struct task_struct *t, *t1;
+
+	list_for_each_entry_safe(t, t1, hop, rcu_tasks_holdout_list) {
+		check_holdout_task(t, needreport, firstreport);
+		cond_resched();
+	}
+}
+
+/* Finish off the Tasks-RCU grace period. */
+static void rcu_tasks_postgp(struct rcu_tasks *rtp)
+{
+	/*
+	 * Because ->on_rq and ->nvcsw are not guaranteed to have full
+	 * memory barriers prior to them in the schedule() path, memory
+	 * reordering on other CPUs could cause their RCU-tasks read-side
+	 * critical sections to extend past the end of the grace period.
+	 * However, because these ->nvcsw updates are carried out with
+	 * interrupts disabled, we can use synchronize_rcu() to force the
+	 * needed ordering on all such CPUs.
+	 *
+	 * This synchronize_rcu() also confines all ->rcu_tasks_holdout
+	 * accesses to be within the grace period, avoiding the need for
+	 * memory barriers for ->rcu_tasks_holdout accesses.
+	 *
+	 * In addition, this synchronize_rcu() waits for exiting tasks
+	 * to complete their final preempt_disable() region of execution,
+	 * enforcing that the whole region, from tasklist removal until
+	 * the final schedule() with TASK_DEAD state, is an RCU Tasks
+	 * read-side critical section.
+	 */
+	synchronize_rcu();
+}
+
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+{
+#ifndef CONFIG_TINY_RCU
+	int rtsi;
+
+	rtsi = READ_ONCE(rcu_task_stall_info);
+	pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n",
+		__func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq,
+		tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies);
+	pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n");
+	tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+	add_timer(&tasks_rcu_exit_srcu_stall_timer);
+#endif // #ifndef CONFIG_TINY_RCU
+}
+
+/**
+ * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed.  call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), cond_resched_tasks_rcu_qs(), entry into idle,
+ * or transition to usermode execution.  As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ */
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
+{
+	call_rcu_tasks_generic(rhp, func, &rcu_tasks);
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks);
+
+/**
+ * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full rcu-tasks
+ * grace period has elapsed, in other words after all currently
+ * executing rcu-tasks read-side critical sections have completed.  These
+ * read-side critical sections are delimited by calls to schedule(),
+ * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
+ * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
+ *
+ * This is a very specialized primitive, intended only for a few uses in
+ * tracing and other situations requiring manipulation of function
+ * preambles and profiling hooks.  The synchronize_rcu_tasks() function
+ * is not (yet) intended for heavy use from multiple CPUs.
+ *
+ * See the description of synchronize_rcu() for more detailed information
+ * on memory ordering guarantees.
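+ *
+ * As an illustrative sketch only (remove_my_tracing_hook() and my_tramp
+ * are hypothetical names, not kernel APIs), a typical update-side
+ * sequence looks like this::
+ *
+ *	remove_my_tracing_hook();	// No new tasks can enter my_tramp.
+ *	synchronize_rcu_tasks();	// Wait for every task to voluntarily
+ *					// switch context, go idle, or run
+ *					// in usermode.
+ *	kfree(my_tramp);		// No task can still be inside it.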
+ */ +void synchronize_rcu_tasks(void) +{ + synchronize_rcu_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + rcu_barrier_tasks_generic(&rcu_tasks); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); + +static int rcu_tasks_lazy_ms = -1; +module_param(rcu_tasks_lazy_ms, int, 0444); + +static int __init rcu_spawn_tasks_kthread(void) +{ + rcu_tasks.gp_sleep = HZ / 10; + rcu_tasks.init_fract = HZ / 10; + if (rcu_tasks_lazy_ms >= 0) + rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms); + rcu_tasks.pregp_func = rcu_tasks_pregp_step; + rcu_tasks.pertask_func = rcu_tasks_pertask; + rcu_tasks.postscan_func = rcu_tasks_postscan; + rcu_tasks.holdouts_func = check_all_holdout_tasks; + rcu_tasks.postgp_func = rcu_tasks_postgp; + rcu_tasks.wait_state = TASK_IDLE; + rcu_spawn_tasks_kthread_generic(&rcu_tasks); + return 0; +} + +#if !defined(CONFIG_TINY_RCU) +void show_rcu_tasks_classic_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); +} +EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); + +void rcu_tasks_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print); +#endif // !defined(CONFIG_TINY_RCU) + +struct task_struct *get_rcu_tasks_gp_kthread(void) +{ + return rcu_tasks.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); + +void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_get_gp_data); + +/* + * Protect against tasklist scan blind spot while the task is exiting and + * may be removed from the tasklist. Do this by adding the task to yet + * another list. + * + * Note that the task will remove itself from this list, so there is no + * need for get_task_struct(), except in the case where rcu_tasks_pertask() + * adds it to the holdout list, in which case rcu_tasks_pertask() supplies + * the needed get_task_struct(). + */ +void exit_tasks_rcu_start(void) +{ + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t = current; + + WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list)); + preempt_disable(); + rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu); + t->rcu_tasks_exit_cpu = smp_processor_id(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + WARN_ON_ONCE(!rtpcp->rtp_exit_list.next); + list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + preempt_enable(); +} + +/* + * Remove the task from the "yet another list" because do_exit() is now + * non-preemptible, allowing synchronize_rcu() to wait beyond this point. 
+ */
+void exit_tasks_rcu_finish(void)
+{
+	unsigned long flags;
+	struct rcu_tasks_percpu *rtpcp;
+	struct task_struct *t = current;
+
+	WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list));
+	rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu);
+	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
+	list_del_init(&t->rcu_tasks_exit_list);
+	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+
+	exit_tasks_rcu_finish_trace(t);
+}
+
+#else /* #ifdef CONFIG_TASKS_RCU */
+void exit_tasks_rcu_start(void) { }
+void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+
+////////////////////////////////////////////////////////////////////////
+//
+// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's
+// trick of passing an empty function to schedule_on_each_cpu().
+// This approach provides batching of concurrent calls to the synchronous
+// synchronize_rcu_tasks_rude() API.  This invokes schedule_on_each_cpu()
+// in order to send IPIs far and wide and induces otherwise unnecessary
+// context switches on all online CPUs, whether idle or not.
+//
+// Callback handling is provided by the rcu_tasks_kthread() function.
+//
+// Ordering is provided by the scheduler's context-switch code.
+
+// Empty function to allow workqueues to force a context switch.
+static void rcu_tasks_be_rude(struct work_struct *work)
+{
+}
+
+// Wait for one rude RCU-tasks grace period.
+static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
+{
+	rtp->n_ipis += cpumask_weight(cpu_online_mask);
+	schedule_on_each_cpu(rcu_tasks_be_rude);
+}
+
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func);
+DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
+		 "RCU Tasks Rude");
+
+/*
+ * call_rcu_tasks_rude() - Queue a callback for invocation after a rude task-based grace period
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed.  call_rcu_tasks_rude()
+ * assumes that the read-side critical sections end at context switch,
+ * cond_resched_tasks_rcu_qs(), or transition to usermode execution (as
+ * usermode execution is schedulable).  As such, there are no read-side
+ * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
+ * this primitive is intended to determine that all tasks have passed
+ * through a safe state, not so much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ *
+ * This is no longer exported, and is instead reserved for use by
+ * synchronize_rcu_tasks_rude().
+ */
+static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
+{
+	call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude);
+}
+
+/**
+ * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period
+ *
+ * Control will return to the caller some time after a rude rcu-tasks
+ * grace period has elapsed, in other words after all currently
+ * executing rcu-tasks read-side critical sections have completed.  These
+ * read-side critical sections are delimited by calls to schedule(),
+ * cond_resched_tasks_rcu_qs(), userspace execution (which is a schedulable
+ * context), and (in theory, anyway) cond_resched().
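+ *
+ * As an illustrative sketch (assumed usage, not taken from this file),
+ * any non-preemptible region acts as a read-side critical section for
+ * this variant, because the quiescent state is a context switch::
+ *
+ *	preempt_disable();
+ *	do_something_traced();	// Hypothetical function, protected
+ *				// against rude grace periods.
+ *	preempt_enable();	// Reader ends at the next context switch.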
+ * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function preambles + * and profiling hooks. The synchronize_rcu_tasks_rude() function is not + * (yet) intended for heavy use from multiple CPUs. + * + * See the description of synchronize_rcu() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_tasks_rude(void) +{ + if (!IS_ENABLED(CONFIG_ARCH_WANTS_NO_INSTR) || IS_ENABLED(CONFIG_FORCE_TASKS_RUDE_RCU)) + synchronize_rcu_tasks_generic(&rcu_tasks_rude); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); + +static int __init rcu_spawn_tasks_rude_kthread(void) +{ + rcu_tasks_rude.gp_sleep = HZ / 10; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); + return 0; +} + +#if !defined(CONFIG_TINY_RCU) +void show_rcu_tasks_rude_gp_kthread(void) +{ + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); +} +EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); + +void rcu_tasks_rude_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print); +#endif // !defined(CONFIG_TINY_RCU) + +struct task_struct *get_rcu_tasks_rude_gp_kthread(void) +{ + return rcu_tasks_rude.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread); + +void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks_rude.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data); + +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ + +//////////////////////////////////////////////////////////////////////// +// +// Tracing variant of Tasks RCU. This variant is designed to be used +// to protect tracing hooks, including those of BPF. This variant +// therefore: +// +// 1. Has explicit read-side markers to allow finite grace periods +// in the face of in-kernel loops for PREEMPT=n builds. +// +// 2. Protects code in the idle loop, exception entry/exit, and +// CPU-hotplug code paths, similar to the capabilities of SRCU. +// +// 3. Avoids expensive read-side instructions, having overhead similar +// to that of Preemptible RCU. +// +// There are of course downsides. For example, the grace-period code +// can send IPIs to CPUs, even when those CPUs are in the idle loop or +// in nohz_full userspace. If needed, these downsides can be at least +// partially remedied. +// +// Perhaps most important, this variant of RCU does not affect the vanilla +// flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace +// readers can operate from idle, offline, and exception entry/exit in no +// way allows rcu_preempt and rcu_sched readers to also do so. +// +// The implementation uses rcu_tasks_wait_gp(), which relies on function +// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread() +// function sets these function pointers up so that rcu_tasks_wait_gp() +// invokes these functions in this order: +// +// rcu_tasks_trace_pregp_step(): +// Disables CPU hotplug, adds all currently executing tasks to the +// holdout list, then checks the state of all tasks that blocked +// or were preempted within their current RCU Tasks Trace read-side +// critical section, adding them to the holdout list if appropriate. +// Finally, this function re-enables CPU hotplug. +// The ->pertask_func() pointer is NULL, so there is no per-task processing. 
+// rcu_tasks_trace_postscan(): +// Invokes synchronize_rcu() to wait for late-stage exiting tasks +// to finish exiting. +// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty: +// Scans the holdout list, attempting to identify a quiescent state +// for each task on the list. If there is a quiescent state, the +// corresponding task is removed from the holdout list. Once this +// list is empty, the grace period has completed. +// rcu_tasks_trace_postgp(): +// Provides the needed full memory barrier and does debug checks. +// +// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks. +// +// Pre-grace-period update-side code is ordered before the grace period +// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period +// read-side code is ordered before the grace period by atomic operations +// on .b.need_qs flag of each task involved in this process, or by scheduler +// context-switch ordering (for locked-down non-running readers). + +// The lockdep state must be outside of #ifdef to be useful. +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_trace_key; +struct lockdep_map rcu_trace_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_trace", &rcu_lock_trace_key); +EXPORT_SYMBOL_GPL(rcu_trace_lock_map); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TASKS_TRACE_RCU + +// Record outstanding IPIs to each CPU. No point in sending two... +static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); + +// The number of detections of task quiescent state relying on +// heavyweight readers executing explicit memory barriers. +static unsigned long n_heavy_reader_attempts; +static unsigned long n_heavy_reader_updates; +static unsigned long n_heavy_reader_ofl_updates; +static unsigned long n_trc_holdouts; + +void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, + "RCU Tasks Trace"); + +/* Load from ->trc_reader_special.b.need_qs with proper ordering. */ +static u8 rcu_ld_need_qs(struct task_struct *t) +{ + smp_mb(); // Enforce full grace-period ordering. + return smp_load_acquire(&t->trc_reader_special.b.need_qs); +} + +/* Store to ->trc_reader_special.b.need_qs with proper ordering. */ +static void rcu_st_need_qs(struct task_struct *t, u8 v) +{ + smp_store_release(&t->trc_reader_special.b.need_qs, v); + smp_mb(); // Enforce full grace-period ordering. +} + +/* + * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for + * the four-byte operand-size restriction of some platforms. + * + * Returns the old value, which is often ignored. + */ +u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) +{ + return cmpxchg(&t->trc_reader_special.b.need_qs, old, new); +} +EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); + +/* + * If we are the last reader, signal the grace-period kthread. + * Also remove from the per-CPU list of blocked tasks. + */ +void rcu_read_unlock_trace_special(struct task_struct *t) +{ + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + union rcu_special trs; + + // Open-coded full-word version of rcu_ld_need_qs(). + smp_mb(); // Enforce full grace-period ordering. + trs = smp_load_acquire(&t->trc_reader_special); + + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) + smp_mb(); // Pairs with update-side barriers. + // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. 
+ if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) { + u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS, + TRC_NEED_QS_CHECKED); + + WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result); + } + if (trs.b.blocked) { + rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_del_init(&t->trc_blkd_node); + WRITE_ONCE(t->trc_reader_special.b.blocked, false); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } + WRITE_ONCE(t->trc_reader_nesting, 0); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); + +/* Add a newly blocked reader task to its CPU's list. */ +void rcu_tasks_trace_qs_blkd(struct task_struct *t) +{ + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + + local_irq_save(flags); + rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled + t->trc_blkd_cpu = smp_processor_id(); + if (!rtpcp->rtp_blkd_tasks.next) + INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); + list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); + WRITE_ONCE(t->trc_reader_special.b.blocked, true); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); + +/* Add a task to the holdout list, if it is not already on the list. */ +static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) +{ + if (list_empty(&t->trc_holdout_list)) { + get_task_struct(t); + list_add(&t->trc_holdout_list, bhp); + n_trc_holdouts++; + } +} + +/* Remove a task from the holdout list, if it is in fact present. */ +static void trc_del_holdout(struct task_struct *t) +{ + if (!list_empty(&t->trc_holdout_list)) { + list_del_init(&t->trc_holdout_list); + put_task_struct(t); + n_trc_holdouts--; + } +} + +/* IPI handler to check task state. */ +static void trc_read_check_handler(void *t_in) +{ + int nesting; + struct task_struct *t = current; + struct task_struct *texp = t_in; + + // If the task is no longer running on this CPU, leave. + if (unlikely(texp != t)) + goto reset_ipi; // Already on holdout list, so will check later. + + // If the task is not in a read-side critical section, and + // if this is the last reader, awaken the grace-period kthread. + nesting = READ_ONCE(t->trc_reader_nesting); + if (likely(!nesting)) { + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); + goto reset_ipi; + } + // If we are racing with an rcu_read_unlock_trace(), try again later. + if (unlikely(nesting < 0)) + goto reset_ipi; + + // Get here if the task is in a read-side critical section. + // Set its state so that it will update state for the grace-period + // kthread upon exit from that critical section. + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED); + +reset_ipi: + // Allow future IPIs to be sent on CPU and for task. + // Also order this IPI handler against any later manipulations of + // the intended task. + smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^ + smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^ +} + +/* Callback function for scheduler to check locked-down task. */ +static int trc_inspect_reader(struct task_struct *t, void *bhp_in) +{ + struct list_head *bhp = bhp_in; + int cpu = task_cpu(t); + int nesting; + bool ofl = cpu_is_offline(cpu); + + if (task_curr(t) && !ofl) { + // If no chance of heavyweight readers, do it the hard way. 
+		if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
+			return -EINVAL;
+
+		// If heavyweight readers are enabled on the remote task,
+		// we can inspect its state even though it is currently
+		// running.  However, we cannot safely change its state.
+		n_heavy_reader_attempts++;
+		// Check for "running" idle tasks on offline CPUs.
+		if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting))
+			return -EINVAL; // No quiescent state, do it the hard way.
+		n_heavy_reader_updates++;
+		nesting = 0;
+	} else {
+		// The task is not running, so C-language access is safe.
+		nesting = t->trc_reader_nesting;
+		WARN_ON_ONCE(ofl && task_curr(t) && (t != idle_task(task_cpu(t))));
+		if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl)
+			n_heavy_reader_ofl_updates++;
+	}
+
+	// If not exiting a read-side critical section, mark as checked
+	// so that the grace-period kthread will remove it from the
+	// holdout list.
+	if (!nesting) {
+		rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+		return 0;  // In QS, so done.
+	}
+	if (nesting < 0)
+		return -EINVAL; // Reader transitioning, try again later.
+
+	// The task is in a read-side critical section, so set up its
+	// state so that it will update state upon exit from that critical
+	// section.
+	if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED))
+		trc_add_holdout(t, bhp);
+	return 0;
+}
+
+/* Attempt to extract the state for the specified task. */
+static void trc_wait_for_one_reader(struct task_struct *t,
+				    struct list_head *bhp)
+{
+	int cpu;
+
+	// If a previous IPI is still in flight, let it complete.
+	if (smp_load_acquire(&t->trc_ipi_to_cpu) != -1) // Order IPI
+		return;
+
+	// The current task had better be in a quiescent state.
+	if (t == current) {
+		rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+		WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
+		return;
+	}
+
+	// Attempt to nail down the task for inspection.
+	get_task_struct(t);
+	if (!task_call_func(t, trc_inspect_reader, bhp)) {
+		put_task_struct(t);
+		return;
+	}
+	put_task_struct(t);
+
+	// If this task is not yet on the holdout list, then we are in
+	// an RCU read-side critical section.  Otherwise, the invocation of
+	// trc_add_holdout() that added it to the list did the necessary
+	// get_task_struct().  Either way, the task cannot be freed out
+	// from under this code.
+
+	// Add to the holdout list and, if the task is currently running,
+	// send an IPI.
+	trc_add_holdout(t, bhp);
+	if (task_curr(t) &&
+	    time_after(jiffies + 1, rcu_tasks_trace.gp_start + rcu_task_ipi_delay)) {
+		// The task is currently running, so try IPIing it.
+		cpu = task_cpu(t);
+
+		// If there is already an IPI outstanding, let it happen.
+		if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0)
+			return;
+
+		per_cpu(trc_ipi_to_cpu, cpu) = true;
+		t->trc_ipi_to_cpu = cpu;
+		rcu_tasks_trace.n_ipis++;
+		if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
+			// Just in case there is some other reason for
+			// failure than the target CPU being offline.
+			WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
+				  __func__, cpu);
+			rcu_tasks_trace.n_ipis_fails++;
+			per_cpu(trc_ipi_to_cpu, cpu) = false;
+			t->trc_ipi_to_cpu = -1;
+		}
+	}
+}
+
+/*
+ * Initialize for first-round processing for the specified task.
+ * Return false if the task is NULL or already taken care of, true otherwise.
+ */
+static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself)
+{
+	// During early boot when there is only the one boot CPU, there
+	// is no idle task for the other CPUs.
Also, the grace-period + // kthread is always in a quiescent state. In addition, just return + // if this task is already on the list. + if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list)) + return false; + + rcu_st_need_qs(t, 0); + t->trc_ipi_to_cpu = -1; + return true; +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) +{ + if (rcu_tasks_trace_pertask_prep(t, true)) + trc_wait_for_one_reader(t, hop); +} + +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(struct list_head *hop) +{ + LIST_HEAD(blkd_tasks); + int cpu; + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t; + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); + + // Disable CPU hotplug across the CPU scan for the benefit of + // any IPIs that might be needed. This also waits for all readers + // in CPU-hotplug code paths. + cpus_read_lock(); + + // These rcu_tasks_trace_pertask_prep() calls are serialized to + // allow safe access to the hop list. + for_each_online_cpu(cpu) { + rcu_read_lock(); + // Note that cpu_curr_snapshot() picks up the target + // CPU's current task while its runqueue is locked with + // an smp_mb__after_spinlock(). This ensures that either + // the grace-period kthread will see that task's read-side + // critical section or the task will see the updater's pre-GP + // accesses. The trailing smp_mb() in cpu_curr_snapshot() + // does not currently play a role other than simplify + // that function's ordering semantics. If these simplified + // ordering semantics continue to be redundant, that smp_mb() + // might be removed. + t = cpu_curr_snapshot(cpu); + if (rcu_tasks_trace_pertask_prep(t, true)) + trc_add_holdout(t, hop); + rcu_read_unlock(); + cond_resched_tasks_rcu_qs(); + } + + // Only after all running tasks have been accounted for is it + // safe to take care of the tasks that have blocked within their + // current RCU tasks trace read-side critical section. + for_each_possible_cpu(cpu) { + rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks); + while (!list_empty(&blkd_tasks)) { + rcu_read_lock(); + t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node); + list_del_init(&t->trc_blkd_node); + list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + rcu_tasks_trace_pertask(t, hop); + rcu_read_unlock(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + } + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + cond_resched_tasks_rcu_qs(); + } + + // Re-enable CPU hotplug now that the holdout list is populated. + cpus_read_unlock(); +} + +/* + * Do intermediate processing between task and holdout scans. + */ +static void rcu_tasks_trace_postscan(struct list_head *hop) +{ + // Wait for late-stage exiting tasks to finish exiting. + // These might have passed the call to exit_tasks_rcu_finish(). + + // If you remove the following line, update rcu_trace_implies_rcu_gp()!!! + synchronize_rcu(); + // Any tasks that exit after this point will set + // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs. +} + +/* Communicate task state back to the RCU tasks trace stall warning request. 
*/ +struct trc_stall_chk_rdr { + int nesting; + int ipi_to_cpu; + u8 needqs; +}; + +static int trc_check_slow_task(struct task_struct *t, void *arg) +{ + struct trc_stall_chk_rdr *trc_rdrp = arg; + + if (task_curr(t) && cpu_online(task_cpu(t))) + return false; // It is running, so decline to inspect it. + trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); + trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); + trc_rdrp->needqs = rcu_ld_need_qs(t); + return true; +} + +/* Show the state of a task stalling the current RCU tasks trace GP. */ +static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) +{ + int cpu; + struct trc_stall_chk_rdr trc_rdr; + bool is_idle_tsk = is_idle_task(t); + + if (*firstreport) { + pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); + *firstreport = false; + } + cpu = task_cpu(t); + if (!task_call_func(t, trc_check_slow_task, &trc_rdr)) + pr_alert("P%d: %c%c\n", + t->pid, + ".I"[t->trc_ipi_to_cpu >= 0], + ".i"[is_idle_tsk]); + else + pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n", + t->pid, + ".I"[trc_rdr.ipi_to_cpu >= 0], + ".i"[is_idle_tsk], + ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], + ".B"[!!data_race(t->trc_reader_special.b.blocked)], + trc_rdr.nesting, + " !CN"[trc_rdr.needqs & 0x3], + " ?"[trc_rdr.needqs > 0x3], + cpu, cpu_online(cpu) ? "" : "(offline)"); + sched_show_task(t); +} + +/* List stalled IPIs for RCU tasks trace. */ +static void show_stalled_ipi_trace(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + if (per_cpu(trc_ipi_to_cpu, cpu)) + pr_alert("\tIPI outstanding to CPU %d\n", cpu); +} + +/* Do one scan of the holdout list. */ +static void check_all_holdout_tasks_trace(struct list_head *hop, + bool needreport, bool *firstreport) +{ + struct task_struct *g, *t; + + // Disable CPU hotplug across the holdout list scan for IPIs. + cpus_read_lock(); + + list_for_each_entry_safe(t, g, hop, trc_holdout_list) { + // If safe and needed, try to check the current task. + if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && + !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED)) + trc_wait_for_one_reader(t, hop); + + // If check succeeded, remove this task from the list. + if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && + rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED) + trc_del_holdout(t); + else if (needreport) + show_stalled_task_trace(t, firstreport); + cond_resched_tasks_rcu_qs(); + } + + // Re-enable CPU hotplug now that the holdout list scan has completed. + cpus_read_unlock(); + + if (needreport) { + if (*firstreport) + pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n"); + show_stalled_ipi_trace(); + } +} + +static void rcu_tasks_trace_empty_fn(void *unused) +{ +} + +/* Wait for grace period to complete and provide ordering. */ +static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) +{ + int cpu; + + // Wait for any lingering IPI handlers to complete. Note that + // if a CPU has gone offline or transitioned to userspace in the + // meantime, all IPI handlers should have been drained beforehand. + // Yes, this assumes that CPUs process IPIs in order. If that ever + // changes, there will need to be a recheck and/or timed wait. + for_each_online_cpu(cpu) + if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) + smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); + + smp_mb(); // Caller's code must be ordered after wakeup. + // Pairs with pretty much every ordering primitive. +} + +/* Report any needed quiescent state for this exiting task. 
+ */
+static void exit_tasks_rcu_finish_trace(struct task_struct *t)
+{
+	union rcu_special trs = READ_ONCE(t->trc_reader_special);
+
+	rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED);
+	WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
+	if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked))
+		rcu_read_unlock_trace_special(t);
+	else
+		WRITE_ONCE(t->trc_reader_nesting, 0);
+}
+
+/**
+ * call_rcu_tasks_trace() - Queue a callback for invocation after a trace task-based grace period
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a trace rcu-tasks
+ * grace period elapses, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have completed.  These
+ * read-side critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ */
+void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func)
+{
+	call_rcu_tasks_generic(rhp, func, &rcu_tasks_trace);
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
+
+/**
+ * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
+ *
+ * Control will return to the caller some time after a trace rcu-tasks
+ * grace period has elapsed, in other words after all currently executing
+ * trace rcu-tasks read-side critical sections have completed.  These read-side
+ * critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
+ *
+ * This is a very specialized primitive, intended only for a few uses in
+ * tracing and other situations requiring manipulation of function preambles
+ * and profiling hooks.  The synchronize_rcu_tasks_trace() function is not
+ * (yet) intended for heavy use from multiple CPUs.
+ *
+ * See the description of synchronize_rcu() for more detailed information
+ * on memory ordering guarantees.
+ */
+void synchronize_rcu_tasks_trace(void)
+{
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_trace_lock_map), "Illegal synchronize_rcu_tasks_trace() in RCU Tasks Trace read-side critical section");
+	synchronize_rcu_tasks_generic(&rcu_tasks_trace);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace);
+
+/**
+ * rcu_barrier_tasks_trace - Wait for in-flight call_rcu_tasks_trace() callbacks.
+ *
+ * Although the current implementation is guaranteed to wait, it is not
+ * obligated to, for example, if there are no pending callbacks.
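+ *
+ * Illustrative (hypothetical) module-unload sketch; my_module_exit()
+ * and free_obj() are made-up names::
+ *
+ *	static void my_module_exit(void)
+ *	{
+ *		// ...stop queueing new call_rcu_tasks_trace() callbacks...
+ *		rcu_barrier_tasks_trace();	// Wait for callbacks such as
+ *						// free_obj() to finish.
+ *	}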
+ */ +void rcu_barrier_tasks_trace(void) +{ + rcu_barrier_tasks_generic(&rcu_tasks_trace); +} +EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); + +int rcu_tasks_trace_lazy_ms = -1; +module_param(rcu_tasks_trace_lazy_ms, int, 0444); + +static int __init rcu_spawn_tasks_trace_kthread(void) +{ + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { + rcu_tasks_trace.gp_sleep = HZ / 10; + rcu_tasks_trace.init_fract = HZ / 10; + } else { + rcu_tasks_trace.gp_sleep = HZ / 200; + if (rcu_tasks_trace.gp_sleep <= 0) + rcu_tasks_trace.gp_sleep = 1; + rcu_tasks_trace.init_fract = HZ / 200; + if (rcu_tasks_trace.init_fract <= 0) + rcu_tasks_trace.init_fract = 1; + } + if (rcu_tasks_trace_lazy_ms >= 0) + rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms); + rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; + rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; + rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; + rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; + rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); + return 0; +} + +#if !defined(CONFIG_TINY_RCU) +void show_rcu_tasks_trace_gp_kthread(void) +{ + char buf[64]; + + snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu", + data_race(n_trc_holdouts), + data_race(n_heavy_reader_ofl_updates), + data_race(n_heavy_reader_updates), + data_race(n_heavy_reader_attempts)); + show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); +} +EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); + +void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); +#endif // !defined(CONFIG_TINY_RCU) + +struct task_struct *get_rcu_tasks_trace_gp_kthread(void) +{ + return rcu_tasks_trace.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); + +void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data); + +#else /* #ifdef CONFIG_TASKS_TRACE_RCU */ +static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } +#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ + +#ifndef CONFIG_TINY_RCU +void show_rcu_tasks_gp_kthreads(void) +{ + show_rcu_tasks_classic_gp_kthread(); + show_rcu_tasks_rude_gp_kthread(); + show_rcu_tasks_trace_gp_kthread(); +} +#endif /* #ifndef CONFIG_TINY_RCU */ + +#ifdef CONFIG_PROVE_RCU +struct rcu_tasks_test_desc { + struct rcu_head rh; + const char *name; + bool notrun; + unsigned long runstart; +}; + +static struct rcu_tasks_test_desc tests[] = { + { + .name = "call_rcu_tasks()", + /* If not defined, the test is skipped. */ + .notrun = IS_ENABLED(CONFIG_TASKS_RCU), + }, + { + .name = "call_rcu_tasks_trace()", + /* If not defined, the test is skipped. 
*/ + .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU) + } +}; + +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) +static void test_rcu_tasks_callback(struct rcu_head *rhp) +{ + struct rcu_tasks_test_desc *rttd = + container_of(rhp, struct rcu_tasks_test_desc, rh); + + pr_info("Callback from %s invoked.\n", rttd->name); + + rttd->notrun = false; +} +#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) + +static void rcu_tasks_initiate_self_tests(void) +{ +#ifdef CONFIG_TASKS_RCU + pr_info("Running RCU Tasks wait API self tests\n"); + tests[0].runstart = jiffies; + synchronize_rcu_tasks(); + call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + pr_info("Running RCU Tasks Rude wait API self tests\n"); + synchronize_rcu_tasks_rude(); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + pr_info("Running RCU Tasks Trace wait API self tests\n"); + tests[1].runstart = jiffies; + synchronize_rcu_tasks_trace(); + call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback); +#endif +} + +/* + * Return: 0 - test passed + * 1 - test failed, but have not timed out yet + * -1 - test failed and timed out + */ +static int rcu_tasks_verify_self_tests(void) +{ + int ret = 0; + int i; + unsigned long bst = rcu_task_stall_timeout; + + if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT) + bst = RCU_TASK_BOOT_STALL_TIMEOUT; + for (i = 0; i < ARRAY_SIZE(tests); i++) { + while (tests[i].notrun) { // still hanging. + if (time_after(jiffies, tests[i].runstart + bst)) { + pr_err("%s has failed boot-time tests.\n", tests[i].name); + ret = -1; + break; + } + ret = 1; + break; + } + } + WARN_ON(ret < 0); + + return ret; +} + +/* + * Repeat the rcu_tasks_verify_self_tests() call once every second until the + * test passes or has timed out. + */ +static struct delayed_work rcu_tasks_verify_work; +static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused) +{ + int ret = rcu_tasks_verify_self_tests(); + + if (ret <= 0) + return; + + /* Test fails but not timed out yet, reschedule another check */ + schedule_delayed_work(&rcu_tasks_verify_work, HZ); +} + +static int rcu_tasks_verify_schedule_work(void) +{ + INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn); + rcu_tasks_verify_work_fn(NULL); + return 0; +} +late_initcall(rcu_tasks_verify_schedule_work); +#else /* #ifdef CONFIG_PROVE_RCU */ +static void rcu_tasks_initiate_self_tests(void) { } +#endif /* #else #ifdef CONFIG_PROVE_RCU */ + +void __init tasks_cblist_init_generic(void) +{ + lockdep_assert_irqs_disabled(); + WARN_ON(num_online_cpus() > 1); + +#ifdef CONFIG_TASKS_RCU + cblist_init_generic(&rcu_tasks); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + cblist_init_generic(&rcu_tasks_rude); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + cblist_init_generic(&rcu_tasks_trace); +#endif +} + +static int __init rcu_init_tasks_generic(void) +{ +#ifdef CONFIG_TASKS_RCU + rcu_spawn_tasks_kthread(); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + rcu_spawn_tasks_rude_kthread(); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + rcu_spawn_tasks_trace_kthread(); +#endif + + // Run the self-tests. 
+	rcu_tasks_initiate_self_tests();
+
+	return 0;
+}
+core_initcall(rcu_init_tasks_generic);
+
+#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
+static inline void rcu_tasks_bootup_oddness(void) {}
+#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 5f5963ba313e..585cade21010 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
 /*
  * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
  * Copyright IBM Corporation, 2008
  *
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
  *
  * For detailed explanation of Read-Copy Update mechanism see -
  *		Documentation/RCU
@@ -35,6 +22,8 @@
 #include <linux/time.h>
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
 
 #include "rcu.h"
 
@@ -43,17 +32,19 @@ struct rcu_ctrlblk {
 	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
 	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
 	struct rcu_head **curtail;	/* ->next pointer of last CB. */
+	unsigned long gp_seq;		/* Grace-period counter. */
 };
 
 /* Definition for rcupdate control block. */
 static struct rcu_ctrlblk rcu_ctrlblk = {
 	.donetail	= &rcu_ctrlblk.rcucblist,
 	.curtail	= &rcu_ctrlblk.rcucblist,
+	.gp_seq		= 0 - 300UL,
 };
 
 void rcu_barrier(void)
 {
-	wait_rcu_gp(call_rcu);
+	wait_rcu_gp(call_rcu_hurry);
 }
 EXPORT_SYMBOL(rcu_barrier);
 
@@ -65,8 +56,9 @@ void rcu_qs(void)
 	local_irq_save(flags);
 	if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
 		rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
-		raise_softirq(RCU_SOFTIRQ);
+		raise_softirq_irqoff(RCU_SOFTIRQ);
 	}
+	WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
 	local_irq_restore(flags);
 }
 
@@ -76,18 +68,35 @@
  * be called from hardirq context.  It is normally called from the
  * scheduling-clock interrupt.
  */
-void rcu_check_callbacks(int user)
+void rcu_sched_clock_irq(int user)
 {
-	if (user) {
+	if (user)
 		rcu_qs();
-	} else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
-		set_tsk_need_resched(current);
-		set_preempt_need_resched();
-	}
+	else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail)
+		set_need_resched_current();
+}
+
+/*
+ * Reclaim the specified callback by invoking it.  There is no direct
+ * kfree() fast path here, so this always returns false.
+ */
+static inline bool rcu_reclaim_tiny(struct rcu_head *head)
+{
+	rcu_callback_t f;
+
+	rcu_lock_acquire(&rcu_callback_map);
+
+	trace_rcu_invoke_callback("", head);
+	f = head->func;
+	debug_rcu_head_callback(head);
+	WRITE_ONCE(head->func, (rcu_callback_t)0L);
+	f(head);
+	rcu_lock_release(&rcu_callback_map);
+	return false;
+}
 
 /* Invoke the RCU callbacks whose grace period has elapsed.
*/ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(void) { struct rcu_head *next, *list; unsigned long flags; @@ -112,9 +121,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - local_bh_disable(); - __rcu_reclaim("", list); - local_bh_enable(); + rcu_reclaim_tiny(list); list = next; } } @@ -122,8 +129,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused /* * Wait for a grace period to elapse. But it is illegal to invoke * synchronize_rcu() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_rcu() is a quiescent - * state, and so on a UP system, synchronize_rcu() need do nothing. + * Therefore, any legal call to synchronize_rcu() is a quiescent state, + * and so on a UP system, synchronize_rcu() need do nothing, other than + * let the polled APIs know that another grace period elapsed. + * * (But Lai Jiangshan points out the benefits of doing might_sleep() * to reduce latency.) * @@ -135,6 +144,9 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); + preempt_disable(); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); + preempt_enable(); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -145,9 +157,17 @@ EXPORT_SYMBOL_GPL(synchronize_rcu); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { + static atomic_t doublefrees; unsigned long flags; - debug_rcu_head_queue(head); + if (debug_rcu_head_queue(head)) { + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } + return; + } + head->func = func; head->next = NULL; @@ -163,9 +183,70 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); +/* + * Store a grace-period-counter "cookie". For more information, + * see the Tree RCU header comment. + */ +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); + +/* + * Return a grace-period-counter "cookie". For more information, + * see the Tree RCU header comment. + */ +unsigned long get_state_synchronize_rcu(void) +{ + return READ_ONCE(rcu_ctrlblk.gp_seq); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); + +/* + * Return a grace-period-counter "cookie" and ensure that a future grace + * period completes. For more information, see the Tree RCU header comment. + */ +unsigned long start_poll_synchronize_rcu(void) +{ + unsigned long gp_seq = get_state_synchronize_rcu(); + + if (unlikely(is_idle_task(current))) { + /* force scheduling for rcu_qs() */ + resched_cpu(0); + } + return gp_seq; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); + +/* + * Return true if the grace period corresponding to oldstate has completed + * and false otherwise. For more information, see the Tree RCU header + * comment. 
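+ *
+ * Illustrative polled-grace-period sketch (assumed usage, not taken
+ * from this file)::
+ *
+ *	oldstate = start_poll_synchronize_rcu();
+ *	// ...do other work while the grace period progresses...
+ *	if (!poll_state_synchronize_rcu(oldstate))
+ *		synchronize_rcu();	// Not yet complete, so wait.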
+ */ +bool poll_state_synchronize_rcu(unsigned long oldstate) +{ + return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); + +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) +unsigned long long rcutorture_gather_gp_seqs(void) +{ + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL; +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + snprintf(cp, len, "g%04llx", seqs & 0xffffULL); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); +#endif + void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); rcu_early_boot_tests(); - srcu_init(); + tasks_cblist_init_generic(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9180158756d2..293bbd9ac3f4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,27 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. + * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version + * Paul E. McKenney <paulmck@linux.ibm.com> * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - @@ -44,7 +31,10 @@ #include <linux/bitops.h> #include <linux/export.h> #include <linux/completion.h> +#include <linux/kmemleak.h> #include <linux/moduleparam.h> +#include <linux/panic.h> +#include <linux/panic_notifier.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -56,12 +46,25 @@ #include <uapi/linux/sched/types.h> #include <linux/prefetch.h> #include <linux/delay.h> -#include <linux/stop_machine.h> #include <linux/random.h> #include <linux/trace_events.h> #include <linux/suspend.h> #include <linux/ftrace.h> #include <linux/tick.h> +#include <linux/sysrq.h> +#include <linux/kprobes.h> +#include <linux/gfp.h> +#include <linux/oom.h> +#include <linux/smpboot.h> +#include <linux/jiffies.h> +#include <linux/slab.h> +#include <linux/sched/isolation.h> +#include <linux/sched/clock.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/kasan.h> +#include <linux/context_tracking.h> +#include "../time/tick-internal.h" #include "tree.h" #include "rcu.h" @@ -72,37 +75,47 @@ #define MODULE_PARAM_PREFIX "rcutree." /* Data structures. */ - -/* - * Steal a bit from the bottom of ->dynticks for idle entry/exit - * control. Initially this is for TLB flushing. 
- */ -#define RCU_DYNTICK_CTRL_MASK 0x1 -#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) -#ifndef rcu_eqs_special_exit -#define rcu_eqs_special_exit() do { } while (0) -#endif +static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { - .dynticks_nesting = 1, - .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, - .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), + .gpwrap = true, }; -struct rcu_state rcu_state = { + +int rcu_get_gpwrap_count(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return READ_ONCE(rdp->gpwrap_count); +} +EXPORT_SYMBOL_GPL(rcu_get_gpwrap_count); + +static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex), + .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock), .name = RCU_NAME, .abbr = RCU_ABBR, .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), - .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock), + .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, + .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, + rcu_sr_normal_gp_cleanup_work), + .srs_cleanups_pending = ATOMIC_INIT(0), +#ifdef CONFIG_RCU_NOCB_CPU + .nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex), +#endif }; /* Dump rcu_node combining tree at boot to verify correct setup. */ static bool dump_tree; module_param(dump_tree, bool, 0444); +/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT +module_param(use_softirq, bool, 0444); +#endif /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); @@ -113,8 +126,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ -/* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable is initialized to the value @@ -147,17 +158,22 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_data *rdp); -static void sync_sched_exp_online_cleanup(int cpu); +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); +static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); +static bool rcu_rdp_cpu_online(struct rcu_data *rdp); +static bool rcu_init_invoked(void); +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); -/* rcuc/rcub kthread realtime priority */ +/* + * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" + * real-time priority(enabling/disabling) is controlled by + * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration. + */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 
1 : 0; -module_param(kthread_prio, int, 0644); +module_param(kthread_prio, int, 0444); /* Delay in jiffies for grace-period initialization delays, debug only. */ @@ -167,6 +183,15 @@ static int gp_init_delay; module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +static int nohz_full_patience_delay; +module_param(nohz_full_patience_delay, int, 0444); +static int nohz_full_patience_delay_jiffies; + +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) @@ -184,18 +209,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); * the need for long delays to increase some race probabilities with the * need for fast grace periods to increase other race probabilities. */ -#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ - -/* - * Compute the mask of online CPUs for the specified rcu_node structure. - * This will not be stable unless the rcu_node structure's ->lock is - * held, but the bit corresponding to the current CPU will be stable - * in most contexts. - */ -unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) -{ - return READ_ONCE(rnp->qsmaskinitnext); -} +#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */ /* * Return true if an RCU grace period is in progress. The READ_ONCE()s @@ -215,146 +229,118 @@ static long rcu_get_n_cbs_cpu(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */ + if (rcu_segcblist_is_enabled(&rdp->cblist)) return rcu_segcblist_n_cbs(&rdp->cblist); - return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ + return 0; } +/** + * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing + * + * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU. + * This is a special-purpose function to be used in the softirq + * infrastructure and perhaps the occasional long-running softirq + * handler. + * + * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is + * equivalent to momentarily completely enabling preemption. For + * example, given this code:: + * + * local_bh_disable(); + * do_something(); + * rcu_softirq_qs(); // A + * do_something_else(); + * local_bh_enable(); // B + * + * A call to synchronize_rcu() that began concurrently with the + * call to do_something() would be guaranteed to wait only until + * execution reached statement A. Without that rcu_softirq_qs(), + * that same synchronize_rcu() would instead be guaranteed to wait + * until execution reached statement B. + */ void rcu_softirq_qs(void) { + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal rcu_softirq_qs() in RCU read-side critical section"); rcu_qs(); rcu_preempt_deferred_qs(current); + rcu_tasks_qs(current, false); } /* - * Record entry into an extended quiescent state. This is only to be - * called when not already in an extended quiescent state. - */ -static void rcu_dynticks_eqs_enter(void) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - int seq; - - /* - * CPUs seeing atomic_add_return() must see prior RCU read-side - * critical sections, and we also must force ordering with the - * next idle sojourn. 
- */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); - /* Better be in an extended quiescent state! */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_CTR)); - /* Better not have special action (TLB flush) pending! */ - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (seq & RCU_DYNTICK_CTRL_MASK)); -} - -/* - * Record exit from an extended quiescent state. This is only to be - * called from an extended quiescent state. - */ -static void rcu_dynticks_eqs_exit(void) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - int seq; - - /* - * CPUs seeing atomic_add_return() must see prior idle sojourns, - * and we also must force ordering with the next RCU read-side - * critical section. - */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !(seq & RCU_DYNTICK_CTRL_CTR)); - if (seq & RCU_DYNTICK_CTRL_MASK) { - atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); - smp_mb__after_atomic(); /* _exit after clearing mask. */ - /* Prefer duplicate flushes to losing a flush. */ - rcu_eqs_special_exit(); - } -} - -/* - * Reset the current CPU's ->dynticks counter to indicate that the + * Reset the current CPU's RCU_WATCHING counter to indicate that the * newly onlined CPU is no longer in an extended quiescent state. * This will either leave the counter unchanged, or increment it * to the next non-quiescent value. * * The non-atomic test/increment sequence works because the upper bits - * of the ->dynticks counter are manipulated only by the corresponding CPU, + * of the ->state variable are manipulated only by the corresponding CPU, * or when the corresponding CPU is offline. */ -static void rcu_dynticks_eqs_online(void) +static void rcu_watching_online(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR) + if (ct_rcu_watching() & CT_RCU_WATCHING) return; - atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + ct_state_inc(CT_RCU_WATCHING); } /* - * Is the current CPU in an extended quiescent state? - * - * No ordering, as we are sampling CPU-local information. + * Return true if the snapshot returned from ct_rcu_watching() + * indicates that RCU is in an extended quiescent state. */ -bool rcu_dynticks_curr_cpu_in_eqs(void) +static bool rcu_watching_snap_in_eqs(int snap) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(snap & CT_RCU_WATCHING); } -/* - * Snapshot the ->dynticks counter with full ordering so as to allow - * stable comparison of this counter with past and future snapshots. +/** + * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU + * since the specified @snap? + * + * @rdp: The rcu_data corresponding to the CPU for which to check EQS. + * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS. + * + * Returns true if the CPU corresponding to @rdp has spent some time in an + * extended quiescent state since @snap. Note that this doesn't check if it + * /still/ is in an EQS, just that it went through one since @snap. + * + * This is meant to be used in a loop waiting for a CPU to go through an EQS. 
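+ *
+ * An illustrative (not in-tree) polling loop, assuming @snap was taken via
+ * ct_rcu_watching_cpu_acquire() at a point where the CPU was not in an EQS::
+ *
+ *	while (!rcu_watching_snap_stopped_since(rdp, snap))
+ *		schedule_timeout_idle(1);  // Wait for rdp->cpu to pass through an EQS.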
*/ -int rcu_dynticks_snap(struct rcu_data *rdp) +static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap) { - int snap = atomic_add_return(0, &rdp->dynticks); - - return snap & ~RCU_DYNTICK_CTRL_MASK; -} + /* + * The first failing snapshot is already ordered against the accesses + * performed by the remote CPU after it exits idle. + * + * The second snapshot therefore only needs to order against accesses + * performed by the remote CPU prior to entering idle and therefore can + * rely solely on acquire semantics. + */ + if (WARN_ON_ONCE(rcu_watching_snap_in_eqs(snap))) + return true; -/* - * Return true if the snapshot returned from rcu_dynticks_snap() - * indicates that RCU is in an extended quiescent state. - */ -static bool rcu_dynticks_in_eqs(int snap) -{ - return !(snap & RCU_DYNTICK_CTRL_CTR); + return snap != ct_rcu_watching_cpu_acquire(rdp->cpu); } /* - * Return true if the CPU corresponding to the specified rcu_data - * structure has spent some time in an extended quiescent state since - * rcu_dynticks_snap() returned the specified snapshot. + * Return true if the referenced integer is zero while the specified + * CPU remains within a single extended quiescent state. */ -static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) +bool rcu_watching_zero_in_eqs(int cpu, int *vp) { - return snap != rcu_dynticks_snap(rdp); -} + int snap; -/* - * Set the special (bottom) bit of the specified CPU so that it - * will take special action (such as flushing its TLB) on the - * next exit from an extended quiescent state. Returns true if - * the bit was successfully set, or false if the CPU was not in - * an extended quiescent state. - */ -bool rcu_eqs_special_set(int cpu) -{ - int old; - int new; - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + // If not quiescent, force back to earlier extended quiescent state. + snap = ct_rcu_watching_cpu(cpu) & ~CT_RCU_WATCHING; + smp_rmb(); // Order CT state and *vp reads. + if (READ_ONCE(*vp)) + return false; // Non-zero, so report failure; + smp_rmb(); // Order *vp read and CT state re-read. - do { - old = atomic_read(&rdp->dynticks); - if (old & RCU_DYNTICK_CTRL_CTR) - return false; - new = old | RCU_DYNTICK_CTRL_MASK; - } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old); - return true; + // If still in the same extended quiescent state, we are good! + return snap == ct_rcu_watching_cpu(cpu); } /* @@ -368,45 +354,91 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. */ -static void __maybe_unused rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_eqs(void) { - int special; + int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, - &this_cpu_ptr(&rcu_data)->dynticks); + seq = ct_state_inc(2 * CT_RCU_WATCHING); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); + WARN_ON_ONCE(!(seq & CT_RCU_WATCHING)); rcu_preempt_deferred_qs(current); } +EXPORT_SYMBOL_GPL(rcu_momentary_eqs); /** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle * - * If the current CPU is idle or running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. + * If the current CPU is idle and running at a first-level (not nested) + * interrupt, or directly, from idle, return true. 
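+ *
+ * Informal summary of the ct_nmi_nesting() cases handled below:
+ *
+ *	> 1:  non-idle interrupt, or nested idle interrupt: return false
+ *	== 1: first-level interrupt taken from idle: return true
+ *	== 0: not in an interrupt: return true only if RCU is not watching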
+ *
+ * The caller must have at least disabled IRQs.
 */
static int rcu_is_cpu_rrupt_from_idle(void)
{
- return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 &&
- __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1;
+ long nmi_nesting = ct_nmi_nesting();
+
+ /*
+ * Usually called from the tick; but also used from smp_call_function()
+ * for expedited grace periods. The latter can result in running from
+ * the idle task, instead of an actual IPI.
+ */
+ lockdep_assert_irqs_disabled();
+
+ /* Check for counter underflows */
+ RCU_LOCKDEP_WARN(ct_nesting() < 0,
+ "RCU nesting counter underflow!");
+
+ /* Non-idle interrupt or nested idle interrupt */
+ if (nmi_nesting > 1)
+ return false;
+
+ /*
+ * Non-nested idle interrupt (interrupting a section where RCU
+ * wasn't watching).
+ */
+ if (nmi_nesting == 1)
+ return true;
+
+ /* Not in an interrupt */
+ if (!nmi_nesting) {
+ RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current),
+ "RCU nmi_nesting counter not in idle task!");
+ return !rcu_is_watching_curr_cpu();
+ }
+
+ RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!");
+
+ return false;
}
-#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
+#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
+ // Maximum callbacks per rcu_do_batch ...
+#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
static long blimit = DEFAULT_RCU_BLIMIT;
-#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
+#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
static long qhimark = DEFAULT_RCU_QHIMARK;
-#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
+#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit.
static long qlowmark = DEFAULT_RCU_QLOMARK;
+#define DEFAULT_RCU_QOVLD_MULT 2
+#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
+static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
+static long qovld_calc = -1; // No pre-initialization lock acquisitions!
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
+module_param(qovld, long, 0444);
-static ulong jiffies_till_first_fqs = ULONG_MAX;
+static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
static bool rcu_kick_kthreads;
+static int rcu_divisor = 7;
+module_param(rcu_divisor, int, 0644);
+
+/* Force an exit from rcu_do_batch() after 3 milliseconds. */
+static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
+module_param(rcu_resched_ns, long, 0644);
/*
 * How long the grace period must be before we start recruiting
@@ -414,7 +446,7 @@ static bool rcu_kick_kthreads;
 */
static ulong jiffies_till_sched_qs = ULONG_MAX;
module_param(jiffies_till_sched_qs, ulong, 0444);
-static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */
+static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
/*
@@ -432,6 +464,7 @@ static void adjust_jiffies_till_sched_qs(void)
 WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
 return;
 }
+ /* Otherwise, set to the time of the third fqs scan, but bound it below on large systems.
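+ * For example (illustrative numbers only): with both fqs module
+ * parameters at 3 jiffies, j = 3 + 2 * 3 = 9 jiffies, which the
+ * comparison below then raises to the HZ/10 + nr_cpu_ids/RCU_JIFFIES_FQS_DIV
+ * floor on sufficiently large systems.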
*/ j = READ_ONCE(jiffies_till_first_fqs) + 2 * READ_ONCE(jiffies_till_next_fqs); if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) @@ -464,12 +497,12 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param return ret; } -static struct kernel_param_ops first_fqs_jiffies_ops = { +static const struct kernel_param_ops first_fqs_jiffies_ops = { .set = param_set_first_fqs_jiffies, .get = param_get_ulong, }; -static struct kernel_param_ops next_fqs_jiffies_ops = { +static const struct kernel_param_ops next_fqs_jiffies_ops = { .set = param_set_next_fqs_jiffies, .get = param_get_ulong, }; @@ -479,8 +512,7 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next module_param(rcu_kick_kthreads, bool, 0644); static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); -static void force_quiescent_state(void); -static int rcu_pending(void); +static int rcu_pending(int user); /* * Return the number of RCU GPs completed thus far for debug & stats. @@ -504,432 +536,219 @@ unsigned long rcu_exp_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); /* - * Force a quiescent state. - */ -void rcu_force_quiescent_state(void) -{ - force_quiescent_state(); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - -/* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - -/* - * Show the state of the grace-period kthreads. + * Return the root node of the rcu_state structure. */ -void show_rcu_gp_kthreads(void) +static struct rcu_node *rcu_get_root(void) { - int cpu; - unsigned long j; - struct rcu_data *rdp; - struct rcu_node *rnp; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n", - rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, rcu_state.gp_kthread->state, j); - rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) - continue; - pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", - rnp->grplo, rnp->grphi, rnp->gp_seq, - rnp->gp_seq_needed); - if (!rcu_is_leaf_node(rnp)) - continue; - for_each_leaf_node_possible_cpu(rnp, cpu) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->gpwrap || - ULONG_CMP_GE(rcu_state.gp_seq, - rdp->gp_seq_needed)) - continue; - pr_info("\tcpu %d ->gp_seq_needed %lu\n", - cpu, rdp->gp_seq_needed); - } - } - /* sched_show_task(rcu_state.gp_kthread); */ + return &rcu_state.node[0]; } -EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); /* * Send along grace-period-related data for rcutorture diagnostics. */ -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gp_seq) +void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { - switch (test_type) { - case RCU_FLAVOR: - case RCU_BH_FLAVOR: - case RCU_SCHED_FLAVOR: - *flags = READ_ONCE(rcu_state.gp_flags); - *gp_seq = rcu_seq_current(&rcu_state.gp_seq); - break; - default: - break; - } + *flags = READ_ONCE(rcu_state.gp_flags); + *gp_seq = rcu_seq_current(&rcu_state.gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -/* - * Return the root node of the rcu_state structure. - */ -static struct rcu_node *rcu_get_root(void) +/* Gather grace-period sequence numbers for rcutorture diagnostics. 
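+ *
+ * Packing, as read off the shifts and masks in the body: bits 40-55 carry
+ * the low 16 bits of ->gp_seq, bits 16-39 the low 24 bits of
+ * ->expedited_sequence, and bits 0-15 the low 16 bits of ->gp_seq_polled,
+ * matching the "g%04x:e%06x:p%04x" layout printed by
+ * rcutorture_format_gp_seqs() below.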
*/ +unsigned long long rcutorture_gather_gp_seqs(void) { - return &rcu_state.node[0]; + return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL); } +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); -/* - * Enter an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - * - * We crowbar the ->dynticks_nmi_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - */ -static void rcu_eqs_enter(bool user) +/* Format grace-period sequence numbers for rcutorture diagnostics. */ +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + unsigned int egp = (seqs >> 16) & 0xffffffULL; + unsigned int ggp = (seqs >> 40) & 0xffffULL; + unsigned int pgp = seqs & 0xffffULL; - WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE); - WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdp->dynticks_nesting == 0); - if (rdp->dynticks_nesting != 1) { - rdp->dynticks_nesting--; - return; - } - - lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rdp = this_cpu_ptr(&rcu_data); - do_nocb_deferred_wakeup(rdp); - rcu_prepare_for_idle(); - rcu_preempt_deferred_qs(current); - WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ - rcu_dynticks_eqs_enter(); - rcu_dynticks_task_enter(); + snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp); } +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); -/** - * rcu_idle_enter - inform RCU that current CPU is entering idle - * - * Enter idle mode, in other words, -leave- the mode in which RCU - * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in idle, a possibility - * handled by irq_enter() and irq_exit().) - * - * If you add or remove a call to rcu_idle_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) +/* + * An empty function that will trigger a reschedule on + * IRQ tail once IRQs get re-enabled on userspace/guest resume. */ -void rcu_idle_enter(void) +static void late_wakeup_func(struct irq_work *work) { - lockdep_assert_irqs_disabled(); - rcu_eqs_enter(false); } -#ifdef CONFIG_NO_HZ_FULL -/** - * rcu_user_enter - inform RCU that we are resuming userspace. - * - * Enter RCU idle mode right before resuming userspace. No use of RCU - * is permitted between this call and rcu_user_exit(). This way the - * CPU doesn't need to maintain the tick for RCU maintenance purposes - * when the CPU runs in userspace. - * - * If you add or remove a call to rcu_user_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. 
- */ -void rcu_user_enter(void) -{ - lockdep_assert_irqs_disabled(); - rcu_eqs_enter(true); -} -#endif /* CONFIG_NO_HZ_FULL */ +static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = + IRQ_WORK_INIT(late_wakeup_func); /* - * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting - * to let the RCU grace-period handling know that the CPU is back to - * being RCU-idle. + * If either: + * + * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work + * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry. * - * If you add or remove a call to rcu_nmi_exit_common(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. + * In these cases the late RCU wake ups aren't supported in the resched loops and our + * last resort is to fire a local irq_work that will trigger a reschedule once IRQs + * get re-enabled again. */ -static __always_inline void rcu_nmi_exit_common(bool irq) +noinstr void rcu_irq_work_resched(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. - * (We are exiting an NMI handler, so RCU better be paying attention - * to us!) - */ - WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); - - /* - * If the nesting level is not 1, the CPU wasn't RCU-idle, so - * leave it in non-RCU-idle state. - */ - if (rdp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); - WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ - rdp->dynticks_nmi_nesting - 2); + if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) return; - } - - /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks); - WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - - if (irq) - rcu_prepare_for_idle(); - - rcu_dynticks_eqs_enter(); - - if (irq) - rcu_dynticks_task_enter(); -} -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * @irq: Is this call from rcu_irq_exit? - * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_nmi_exit(void) -{ - rcu_nmi_exit_common(false); -} - -/** - * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle - * - * Exit from an interrupt handler, which might possibly result in entering - * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. The caller must have disabled interrupts. - * - * This code assumes that the idle loop never does anything that might - * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture's idle loop violates this assumption, RCU will give you what - * you deserve, good and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - * - * If you add or remove a call to rcu_irq_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_irq_exit(void) -{ - lockdep_assert_irqs_disabled(); - rcu_nmi_exit_common(true); -} - -/* - * Wrapper for rcu_irq_exit() where interrupts are enabled. - * - * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. 
- */ -void rcu_irq_exit_irqson(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_irq_exit(); - local_irq_restore(flags); -} - -/* - * Exit an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - * - * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to - * allow for the possibility of usermode upcalls messing up our count of - * interrupt nesting level during the busy period that is just now starting. - */ -static void rcu_eqs_exit(bool user) -{ - struct rcu_data *rdp; - long oldval; - - lockdep_assert_irqs_disabled(); - rdp = this_cpu_ptr(&rcu_data); - oldval = rdp->dynticks_nesting; - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); - if (oldval) { - rdp->dynticks_nesting++; + if (IS_ENABLED(CONFIG_VIRT_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) return; + + instrumentation_begin(); + if (do_nocb_deferred_wakeup(rdp) && need_resched()) { + irq_work_queue(this_cpu_ptr(&late_wakeup_work)); } - rcu_dynticks_task_exit(); - rcu_dynticks_eqs_exit(); - rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - WRITE_ONCE(rdp->dynticks_nesting, 1); - WARN_ON_ONCE(rdp->dynticks_nmi_nesting); - WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + instrumentation_end(); } +#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) */ +#ifdef CONFIG_PROVE_RCU /** - * rcu_idle_exit - inform RCU that current CPU is leaving idle - * - * Exit idle mode, in other words, -enter- the mode in which RCU - * read-side critical sections can occur. - * - * If you add or remove a call to rcu_idle_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. + * rcu_irq_exit_check_preempt - Validate that scheduling is possible */ -void rcu_idle_exit(void) +void rcu_irq_exit_check_preempt(void) { - unsigned long flags; + lockdep_assert_irqs_disabled(); - local_irq_save(flags); - rcu_eqs_exit(false); - local_irq_restore(flags); + RCU_LOCKDEP_WARN(ct_nesting() <= 0, + "RCU nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(ct_nmi_nesting() != + CT_NESTING_IRQ_NONIDLE, + "Bad RCU nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), + "RCU in extended quiescent state!"); } +#endif /* #ifdef CONFIG_PROVE_RCU */ #ifdef CONFIG_NO_HZ_FULL /** - * rcu_user_exit - inform RCU that we are exiting userspace. + * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. * - * Exit RCU idle mode while entering the kernel because it can - * run a RCU read side critical section anytime. + * The scheduler tick is not normally enabled when CPUs enter the kernel + * from nohz_full userspace execution. After all, nohz_full userspace + * execution is an RCU quiescent state and the time executing in the kernel + * is quite short. Except of course when it isn't. And it is not hard to + * cause a large system to spend tens of seconds or even minutes looping + * in the kernel, which can cause a number of problems, include RCU CPU + * stall warnings. * - * If you add or remove a call to rcu_user_exit(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_user_exit(void) -{ - rcu_eqs_exit(1); -} -#endif /* CONFIG_NO_HZ_FULL */ - -/** - * rcu_nmi_enter_common - inform RCU of entry to NMI context - * @irq: Is this call from rcu_irq_enter? 
+ * Therefore, if a nohz_full CPU fails to report a quiescent state + * in a timely manner, the RCU grace-period kthread sets that CPU's + * ->rcu_urgent_qs flag with the expectation that the next interrupt or + * exception will invoke this function, which will turn on the scheduler + * tick, which will enable RCU to detect that CPU's quiescent states, + * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. + * The tick will be disabled once a quiescent state is reported for + * this CPU. * - * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and - * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know - * that the CPU is active. This implementation permits nested NMIs, as - * long as the nesting level does not overflow an int. (You will probably - * run out of stack space first.) - * - * If you add or remove a call to rcu_nmi_enter_common(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. + * Of course, in carefully tuned systems, there might never be an + * interrupt or exception. In that case, the RCU grace-period kthread + * will eventually cause one to happen. However, in less carefully + * controlled environments, this function allows RCU to get what it + * needs without creating otherwise useless interruptions. */ -static __always_inline void rcu_nmi_enter_common(bool irq) +void __rcu_irq_enter_check_tick(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - long incby = 2; - - /* Complain about underflow. */ - WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); - /* - * If idle from RCU viewpoint, atomically increment ->dynticks - * to mark non-idle and increment ->dynticks_nmi_nesting by one. - * Otherwise, increment ->dynticks_nmi_nesting by two. This means - * if ->dynticks_nmi_nesting is equal to one, we are guaranteed - * to be in the outermost NMI handler that interrupted an RCU-idle - * period (observation due to Andy Lutomirski). - */ - if (rcu_dynticks_curr_cpu_in_eqs()) { - - if (irq) - rcu_dynticks_task_exit(); - - rcu_dynticks_eqs_exit(); + // If we're here from NMI there's nothing to do. + if (in_nmi()) + return; - if (irq) - rcu_cleanup_after_idle(); + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), + "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); - incby = 1; + if (!tick_nohz_full_cpu(rdp->cpu) || + !READ_ONCE(rdp->rcu_urgent_qs) || + READ_ONCE(rdp->rcu_forced_tick)) { + // RCU doesn't need nohz_full help from this CPU, or it is + // already getting that help. + return; } - trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), - rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, rdp->dynticks); - WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ - rdp->dynticks_nmi_nesting + incby); - barrier(); -} -/** - * rcu_nmi_enter - inform RCU of entry to NMI context - */ -void rcu_nmi_enter(void) -{ - rcu_nmi_enter_common(false); + // We get here only when not in an extended quiescent state and + // from interrupts (as opposed to NMIs). Therefore, (1) RCU is + // already watching and (2) The fact that we are in an interrupt + // handler and that the rcu_node lock is an irq-disabled lock + // prevents self-deadlock. So we can safely recheck under the lock. + // Note that the nohz_full state currently cannot change. + raw_spin_lock_rcu_node(rdp->mynode); + if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { + // A nohz_full CPU is in the kernel and RCU needs a + // quiescent state. Turn on the tick! 
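+ // (The matching tick_dep_clear_cpu() is issued by
+ // rcu_disable_urgency_upon_qs() once this CPU's quiescent state
+ // has been reported.)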
+ WRITE_ONCE(rdp->rcu_forced_tick, true); + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); } +NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick); +#endif /* CONFIG_NO_HZ_FULL */ -/** - * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle - * - * Enter an interrupt handler, which might possibly result in exiting - * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. The caller must have disabled interrupts. - * - * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to user mode! - * This code assumes that the idle loop never does upcalls to user mode. - * If your architecture's idle loop does do upcalls to user mode (or does - * anything else that results in unbalanced calls to the irq_enter() and - * irq_exit() functions), RCU will give you what you deserve, good and hard. - * But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. +/* + * Check to see if any future non-offloaded RCU-related work will need + * to be done by the current CPU, even if none need be done immediately, + * returning 1 if so. This function is part of the RCU implementation; + * it is -not- an exported member of the RCU API. This is used by + * the idle-entry code to figure out whether it is safe to disable the + * scheduler-clock interrupt. * - * If you add or remove a call to rcu_irq_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. + * Just check whether or not this CPU has non-offloaded RCU callbacks + * queued. */ -void rcu_irq_enter(void) +int rcu_needs_cpu(void) { - lockdep_assert_irqs_disabled(); - rcu_nmi_enter_common(true); + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && + !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); } /* - * Wrapper for rcu_irq_enter() where interrupts are enabled. - * - * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. + * If any sort of urgency was applied to the current CPU (for example, + * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order + * to get to a quiescent state, disable it. */ -void rcu_irq_enter_irqson(void) +static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { - unsigned long flags; - - local_irq_save(flags); - rcu_irq_enter(); - local_irq_restore(flags); + raw_lockdep_assert_held_rcu_node(rdp->mynode); + WRITE_ONCE(rdp->rcu_urgent_qs, false); + WRITE_ONCE(rdp->rcu_need_heavy_qs, false); + if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { + tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + WRITE_ONCE(rdp->rcu_forced_tick, false); + } } /** - * rcu_is_watching - see if RCU thinks that the current CPU is not idle + * rcu_is_watching - RCU read-side critical sections permitted on current CPU? + * + * Return @true if RCU is watching the running CPU and @false otherwise. + * An @true return means that this CPU can safely enter RCU read-side + * critical sections. + * + * Although calls to rcu_is_watching() from most parts of the kernel + * will return @true, there are important exceptions. For example, if the + * current CPU is deep within its idle loop, in kernel entry/exit code, + * or offline, rcu_is_watching() will return @false. * - * Return true if RCU is watching the running CPU, which means that this - * CPU can safely enter RCU read-side critical sections. 
In other words,
- if the current CPU is not in its idle loop or is in an interrupt or
- NMI handler, return true.
+ * Make notrace because it can be called by the internal functions of
+ * ftrace, and making this notrace removes unnecessary recursion calls.
 */
-bool notrace rcu_is_watching(void)
+notrace bool rcu_is_watching(void)
{
 bool ret;
 preempt_disable_notrace();
- ret = !rcu_dynticks_curr_cpu_in_eqs();
+ ret = rcu_is_watching_curr_cpu();
 preempt_enable_notrace();
 return ret;
}
@@ -953,43 +772,27 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
 smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
}
-#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+static unsigned long seq_gpwrap_lag = ULONG_MAX / 4;
-/*
- * Is the current CPU online as far as RCU is concerned?
- *
- * Disable preemption to avoid false positives that could otherwise
- * happen due to the current CPU number being sampled, this task being
- * preempted, its old CPU being taken offline, resuming on some other CPU,
- * then determining that its old CPU is now offline.
- *
- * Disable checking if in an NMI handler because we cannot safely
- * report errors from NMI handlers anyway. In addition, it is OK to use
- * RCU on an offline processor during initial boot, hence the check for
- * rcu_scheduler_fully_active.
+/**
+ * rcu_set_gpwrap_lag - Set RCU GP sequence overflow lag value.
+ * @lag_gps: Set the overflow lag to this many grace periods' worth of
+ * counters; rcutorture uses this to quickly force a gpwrap situation.
+ * @lag_gps = 0 resets the lag back to the boot-time value.
 */
-bool rcu_lockdep_current_cpu_online(void)
+void rcu_set_gpwrap_lag(unsigned long lag_gps)
{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- bool ret = false;
+ unsigned long lag_seq_count;
- if (in_nmi() || !rcu_scheduler_fully_active)
- return true;
- preempt_disable();
- rdp = this_cpu_ptr(&rcu_data);
- rnp = rdp->mynode;
- if (rdp->grpmask & rcu_rnp_online_cpus(rnp))
- ret = true;
- preempt_enable();
- return ret;
+ lag_seq_count = (lag_gps == 0)
+ ? ULONG_MAX / 4
+ : lag_gps << RCU_SEQ_CTR_SHIFT;
+ WRITE_ONCE(seq_gpwrap_lag, lag_seq_count);
}
-EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-
-#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
+EXPORT_SYMBOL_GPL(rcu_set_gpwrap_lag);
/*
- * We are reporting a quiescent state on behalf of some other CPU, so
+ * When trying to report a quiescent state on behalf of some other CPU,
 * it is our responsibility to check for and handle potential overflow
 * of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
 * After all, the CPU might be in a deep idle state, and thus executing no
@@ -998,22 +801,35 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
{
 raw_lockdep_assert_held_rcu_node(rnp);
- if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4,
- rnp->gp_seq))
+ if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + seq_gpwrap_lag,
+ rnp->gp_seq)) {
 WRITE_ONCE(rdp->gpwrap, true);
+ WRITE_ONCE(rdp->gpwrap_count, READ_ONCE(rdp->gpwrap_count) + 1);
+ }
 if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq))
 rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4;
}
/*
- * Snapshot the specified CPU's dynticks counter so that we can later
+ * Snapshot the specified CPU's RCU_WATCHING counter so that we can later
 * credit it with an implicit quiescent state.
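 * (rcu_watching_snap_save() below and rcu_watching_snap_recheck() further
 * down form the snapshot/recheck pair used by the force-quiescent-state
 * scan in force_qs_rnp(): the first scan of a grace period snapshots each
 * holdout CPU, and later scans recheck those snapshots.)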
 * Return 1 if this CPU is in dynticks idle mode, which is an extended
 * quiescent state.
 */
-static int dyntick_save_progress_counter(struct rcu_data *rdp)
+static int rcu_watching_snap_save(struct rcu_data *rdp)
{
- rdp->dynticks_snap = rcu_dynticks_snap(rdp);
- if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
+ /*
+ * Full ordering between the remote CPU's post-idle accesses and the
+ * updater's accesses prior to the current GP (and also the started GP
+ * sequence number) is enforced by rcu_seq_start()'s implicit barrier,
+ * and even further by the smp_mb__after_unlock_lock() barriers chained
+ * all the way throughout the rnp locking tree since rcu_gp_init() and
+ * up to the current leaf rnp locking.
+ *
+ * Ordering between the remote CPU's pre-idle accesses and the
+ * post-grace-period updater's accesses is enforced by the acquire
+ * semantics below.
+ */
+ rdp->watching_snap = ct_rcu_watching_cpu_acquire(rdp->cpu);
+ if (rcu_watching_snap_in_eqs(rdp->watching_snap)) {
 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
 rcu_gpnum_ovf(rdp->mynode, rdp);
 return 1;
@@ -1021,38 +837,24 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
 return 0;
}
-/*
- * Handler for the irq_work request posted when a grace period has
- * gone on for too long, but not yet long enough for an RCU CPU
- * stall warning. Set state appropriately, but just complain if
- * there is unexpected state on entry.
- */
-static void rcu_iw_handler(struct irq_work *iwp)
-{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- rdp = container_of(iwp, struct rcu_data, rcu_iw);
- rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp);
- if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
- rdp->rcu_iw_gp_seq = rnp->gp_seq;
- rdp->rcu_iw_pending = false;
- }
- raw_spin_unlock_rcu_node(rnp);
-}
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
/*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU, or by virtue of having been offline.
+ * Returns positive if the specified CPU has passed through a quiescent state
+ * by virtue of being in or having passed through a dynticks idle state since
+ * the last call to rcu_watching_snap_save() for this same CPU, or by
+ * virtue of having been offline.
+ *
+ * Returns negative if the specified CPU needs a forced reschedule.
+ *
+ * Returns zero otherwise.
 */
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+static int rcu_watching_snap_recheck(struct rcu_data *rdp)
{
 unsigned long jtsq;
- bool *rnhqp;
- bool *ruqp;
+ int ret = 0;
 struct rcu_node *rnp = rdp->mynode;
/*
@@ -1063,30 +865,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 * read-side critical section that started before the beginning
 * of the current RCU grace period.
 */
- if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) {
+ if (rcu_watching_snap_stopped_since(rdp, rdp->watching_snap)) {
 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
 rcu_gpnum_ovf(rnp, rdp);
 return 1;
}
- /* If waiting too long on an offline CPU, complain. */
- if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) &&
- time_after(jiffies, rcu_state.gp_start + HZ)) {
- bool onl;
+ /*
+ * Complain if a CPU that is considered to be offline from RCU's
+ * perspective has not yet reported a quiescent state.
+ * After all, the offline CPU should have reported a quiescent state
+ * during the CPU-offline process, or, failing that, it should have
+ * been reported by rcu_gp_init() if it ran concurrently with either
+ * the CPU going offline or the last task on a leaf rcu_node structure
+ * exiting its RCU read-side critical section while all CPUs
+ * corresponding to that structure are offline. This added warning
+ * detects bugs in any of these code paths.
+ *
+ * The rcu_node structure's ->lock is held here, which excludes
+ * the relevant portions of the CPU-hotplug code, the grace-period
+ * initialization code, and the rcu_read_unlock() code paths.
+ *
+ * For more detail, please refer to the "Hotplug CPU" section
+ * of RCU's Requirements documentation.
+ */
+ if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
 struct rcu_node *rnp1;
- WARN_ON(1); /* Offline CPUs are supposed to report QS! */
 pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
 __func__, rnp->grplo, rnp->grphi, rnp->level,
 (long)rnp->gp_seq, (long)rnp->completedqs);
 for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
 pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
 __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
 pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
- __func__, rdp->cpu, ".o"[onl],
- (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
- (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
+ __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
+ (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+ (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
 return 1; /* Break things loose after complaining. */
}
@@ -1102,20 +917,19 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 * is set way high.
 */
 jtsq = READ_ONCE(jiffies_to_sched_qs);
- ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
- rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
- if (!READ_ONCE(*rnhqp) &&
+ if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
 (time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
- time_after(jiffies, rcu_state.jiffies_resched))) {
- WRITE_ONCE(*rnhqp, true);
+ time_after(jiffies, rcu_state.jiffies_resched) ||
+ rcu_state.cbovld)) {
+ WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */
- smp_store_release(ruqp, true);
+ smp_store_release(&rdp->rcu_urgent_qs, true);
 } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
- WRITE_ONCE(*ruqp, true);
+ WRITE_ONCE(rdp->rcu_urgent_qs, true);
}
/*
- * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks!
+ * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
 * The above code handles this, but only for straight cond_resched().
 * And some in-kernel loops check need_resched() before calling
 * cond_resched(), which defeats the above code for CPUs that are
@@ -1123,10 +937,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 * So hit them over the head with the resched_cpu() hammer!
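 * (With this rework the hammer is swung by the caller: a negative return
 * value from this function asks the force_qs_rnp() scan to issue the
 * actual resched_cpu().)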
*/ if (tick_nohz_full_cpu(rdp->cpu) && - time_after(jiffies, - READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) { - resched_cpu(rdp->cpu); + (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || + rcu_state.cbovld)) { + WRITE_ONCE(rdp->rcu_urgent_qs, true); WRITE_ONCE(rdp->last_fqs_resched, jiffies); + ret = -1; } /* @@ -1139,317 +954,46 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (time_after(jiffies, rcu_state.jiffies_resched)) { if (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq)) { - resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); + ret = -1; } if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { - init_irq_work(&rdp->rcu_iw, rcu_iw_handler); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); } - } - return 0; -} - -static void record_gp_stall_check_time(void) -{ - unsigned long j = jiffies; - unsigned long j1; - - rcu_state.gp_start = j; - j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ - rcu_state.jiffies_resched = j + j1 / 2; - rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); -} - -/* - * Complain about starvation of grace-period kthread. - */ -static void rcu_check_gp_kthread_starvation(void) -{ - struct task_struct *gpk = rcu_state.gp_kthread; - unsigned long j; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", - rcu_state.name, j, - (long)rcu_seq_current(&rcu_state.gp_seq), - rcu_state.gp_flags, - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); - if (gpk) { - pr_err("RCU grace-period kthread stack dump:\n"); - sched_show_task(gpk); - wake_up_process(gpk); + if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) { + int cpu = rdp->cpu; + struct rcu_snap_record *rsrp; + struct kernel_cpustat *kcsp; + + kcsp = &kcpustat_cpu(cpu); + + rsrp = &rdp->snap_record; + rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); + rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); + rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); + rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu); + rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu); + rsrp->nr_csw = nr_context_switches_cpu(cpu); + rsrp->jiffies = jiffies; + rsrp->gp_seq = rdp->gp_seq; } } -} - -/* - * Dump stacks of all tasks running on stalled CPUs. First try using - * NMIs, but fall back to manual remote stack tracing on architectures - * that don't support NMI-based stack dumps. The NMI-triggered stack - * traces are more accurate because they are printed by the target CPU. - */ -static void rcu_dump_cpu_stacks(void) -{ - int cpu; - unsigned long flags; - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * If too much time has passed in the current grace period, and if - * so configured, go kick the relevant kthreads. 
- */ -static void rcu_stall_kick_kthreads(void) -{ - unsigned long j; - - if (!rcu_kick_kthreads) - return; - j = READ_ONCE(rcu_state.jiffies_kick_kthreads); - if (time_after(jiffies, j) && rcu_state.gp_kthread && - (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { - WARN_ONCE(1, "Kicking %s grace-period kthread\n", - rcu_state.name); - rcu_ftrace_dump(DUMP_ALL); - wake_up_process(rcu_state.gp_kthread); - WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); - } -} - -static void panic_on_rcu_stall(void) -{ - if (sysctl_panic_on_rcu_stall) - panic("RCU Stall\n"); -} - -static void print_other_cpu_stall(unsigned long gp_seq) -{ - int cpu; - unsigned long flags; - unsigned long gpa; - unsigned long j; - int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); - print_cpu_stall_info_begin(); - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - ndetected += rcu_print_task_stall(rnp); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { - print_cpu_stall_info(cpu); - ndetected++; - } - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - if (ndetected) { - rcu_dump_cpu_stacks(); - - /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(); - } else { - if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { - pr_err("INFO: Stall ended before state dump start\n"); - } else { - j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); - pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", - rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), - rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); - } - } - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - - rcu_check_gp_kthread_starvation(); - - panic_on_rcu_stall(); - - force_quiescent_state(); /* Kick them all. */ -} - -static void print_cpu_stall(void) -{ - int cpu; - unsigned long flags; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. 
- */ - pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); - print_cpu_stall_info_begin(); - raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); - print_cpu_stall_info(smp_processor_id()); - raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - - rcu_check_gp_kthread_starvation(); - - rcu_dump_cpu_stacks(); - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - panic_on_rcu_stall(); - - /* - * Attempt to revive the RCU machinery by forcing a context switch. - * - * A context switch would normally allow the RCU state machine to make - * progress and it could be we're stuck in kernel space without context - * switches for an entirely unreasonable amount of time. - */ - set_tsk_need_resched(current); - set_preempt_need_resched(); -} - -static void check_cpu_stall(struct rcu_data *rdp) -{ - unsigned long gs1; - unsigned long gs2; - unsigned long gps; - unsigned long j; - unsigned long jn; - unsigned long js; - struct rcu_node *rnp; - - if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || - !rcu_gp_in_progress()) - return; - rcu_stall_kick_kthreads(); - j = jiffies; - - /* - * Lots of memory barriers to reject false positives. - * - * The idea is to pick up rcu_state.gp_seq, then - * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally - * another copy of rcu_state.gp_seq. These values are updated in - * the opposite order with memory barriers (or equivalent) during - * grace-period initialization and cleanup. Now, a false positive - * can occur if we get an new value of rcu_state.gp_start and a old - * value of rcu_state.jiffies_stall. But given the memory barriers, - * the only way that this can happen is if one grace period ends - * and another starts between these two fetches. This is detected - * by comparing the second fetch of rcu_state.gp_seq with the - * previous fetch from rcu_state.gp_seq. - * - * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, - * and rcu_state.gp_start suffice to forestall false positives. - */ - gs1 = READ_ONCE(rcu_state.gp_seq); - smp_rmb(); /* Pick up ->gp_seq first... */ - js = READ_ONCE(rcu_state.jiffies_stall); - smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = READ_ONCE(rcu_state.gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ - gs2 = READ_ONCE(rcu_state.gp_seq); - if (gs1 != gs2 || - ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) - return; /* No stall or GP completed since entering function. */ - rnp = rdp->mynode; - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); - - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* They had a few time units to dump stack, so complain. 
*/ - print_other_cpu_stall(gs2); - } -} - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); + return ret; } /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) { - trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req, - rnp->level, rnp->grplo, rnp->grphi, s); + trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + gp_seq_req, rnp->level, + rnp->grplo, rnp->grphi, s); } /* @@ -1496,7 +1040,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, TPS("Prestarted")); goto unlock_out; } - rnp->gp_seq_needed = gp_seq_req; + WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req); if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { /* * We just marked the leaf or internal node, and a @@ -1521,18 +1065,18 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, } trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; - if (!rcu_state.gp_kthread) { + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); + if (!READ_ONCE(rcu_state.gp_kthread)) { trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { - rnp_start->gp_seq_needed = rnp->gp_seq_needed; - rdp->gp_seq_needed = rnp->gp_seq_needed; + WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed); + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); } if (rnp != rnp_start) raw_spin_unlock_rcu_node(rnp); @@ -1557,17 +1101,29 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) } /* - * Awaken the grace-period kthread. Don't do a self-awaken, and don't - * bother awakening when there is nothing for the grace-period kthread - * to do (as in several CPUs raced to awaken, and we lost), and finally - * don't try to awaken a kthread that has not yet been created. + * Awaken the grace-period kthread. Don't do a self-awaken (unless in an + * interrupt or softirq handler, in which case we just might immediately + * sleep upon return, resulting in a grace-period hang), and don't bother + * awakening when there is nothing for the grace-period kthread to do + * (as in several CPUs raced to awaken, we lost), and finally don't try + * to awaken a kthread that has not yet been created. If all those checks + * are passed, track some debug information and awaken. + * + * So why do the self-wakeup when in an interrupt or softirq handler + * in the grace-period kthread's context? Because the kthread might have + * been interrupted just as it was going to sleep, and just after the final + * pre-sleep check of the awaken condition. In this case, a wakeup really + * is required, and is therefore supplied. 
*/ static void rcu_gp_kthread_wake(void) { - if (current == rcu_state.gp_kthread || - !READ_ONCE(rcu_state.gp_flags) || - !rcu_state.gp_kthread) + struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); + + if ((current == t && !in_hardirq() && !in_serving_softirq()) || + !READ_ONCE(rcu_state.gp_flags) || !t) return; + WRITE_ONCE(rcu_state.gp_wake_time, jiffies); + WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); swake_up_one(&rcu_state.gp_wq); } @@ -1588,12 +1144,15 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) unsigned long gp_seq_req; bool ret = false; + rcu_lockdep_assert_cblist_protected(rdp); raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc")); + /* * Callbacks are often registered with incomplete grace-period * information. Something about the fact that getting exact @@ -1610,9 +1169,12 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB")); else - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc")); + return ret; } @@ -1629,9 +1191,9 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, unsigned long c; bool needwake; - lockdep_assert_irqs_disabled(); + rcu_lockdep_assert_cblist_protected(rdp); c = rcu_seq_snap(&rcu_state.gp_seq); - if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { + if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { /* Old request still live, so mark recent callbacks. */ (void)rcu_segcblist_accelerate(&rdp->cblist, c); return; @@ -1655,6 +1217,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, */ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) { + rcu_lockdep_assert_cblist_protected(rdp); raw_lockdep_assert_held_rcu_node(rnp); /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ @@ -1672,6 +1235,35 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) } /* + * Move and classify callbacks, but only if doing so won't require + * that the RCU grace-period kthread be awakened. + */ +static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, + struct rcu_data *rdp) +{ + rcu_lockdep_assert_cblist_protected(rdp); + if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp)) + return; + // The grace period cannot end while we hold the rcu_node lock. + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) + WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); + raw_spin_unlock_rcu_node(rnp); +} + +/* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a + * quiescent state. This is intended to be invoked when the CPU notices + * a new grace period. + */ +static void rcu_strict_gp_check_qs(void) +{ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + +/* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. 
The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. @@ -1679,8 +1271,9 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) */ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { - bool ret; - bool need_gp; + bool ret = false; + bool need_qs; + const bool offloaded = rcu_rdp_is_offloaded(rdp); raw_lockdep_assert_held_rcu_node(rnp); @@ -1689,30 +1282,37 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { - ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */ + unlikely(rdp->gpwrap)) { + if (!offloaded) + ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ + rdp->core_needs_qs = false; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); } else { - ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */ + if (!offloaded) + ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */ + if (rdp->core_needs_qs) + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); } /* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't * go looking for one. */ trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart")); - need_gp = !!(rnp->qsmask & rdp->grpmask); - rdp->cpu_no_qs.b.norm = need_gp; - rdp->core_needs_qs = need_gp; + need_qs = !!(rnp->qsmask & rdp->grpmask); + rdp->cpu_no_qs.b.norm = need_qs; + rdp->core_needs_qs = need_qs; zero_cpu_stall_ticks(rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ - if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) - rdp->gp_seq_needed = rnp->gp_seq_needed; + if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) + WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) + WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); return ret; @@ -1734,32 +1334,485 @@ static void note_gp_changes(struct rcu_data *rdp) } needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + rcu_strict_gp_check_qs(); if (needwake) rcu_gp_kthread_wake(); } +static atomic_t *rcu_gp_slow_suppress; + +/* Register a counter to suppress debugging grace-period delays. */ +void rcu_gp_slow_register(atomic_t *rgssp) +{ + WARN_ON_ONCE(rcu_gp_slow_suppress); + + WRITE_ONCE(rcu_gp_slow_suppress, rgssp); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_register); + +/* Unregister a counter, with NULL for not caring which. 
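+ *
+ * Illustrative pairing only (the counter name is hypothetical):
+ *
+ *	static atomic_t gp_delay_suppress;
+ *
+ *	rcu_gp_slow_register(&gp_delay_suppress);
+ *	atomic_inc(&gp_delay_suppress);	// debugging GP delays now suppressed
+ *	...
+ *	atomic_dec(&gp_delay_suppress);
+ *	rcu_gp_slow_unregister(&gp_delay_suppress);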
*/ +void rcu_gp_slow_unregister(atomic_t *rgssp) +{ + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL); + + WRITE_ONCE(rcu_gp_slow_suppress, NULL); +} +EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister); + +static bool rcu_gp_slow_is_suppressed(void) +{ + atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress); + + return rgssp && atomic_read(rgssp); +} + static void rcu_gp_slow(int delay) { - if (delay > 0 && - !(rcu_seq_ctr(rcu_state.gp_seq) % - (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) - schedule_timeout_uninterruptible(delay); + if (!rcu_gp_slow_is_suppressed() && delay > 0 && + !(rcu_seq_ctr(rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) + schedule_timeout_idle(delay); +} + +static unsigned long sleep_duration; + +/* Allow rcutorture to stall the grace-period kthread. */ +void rcu_gp_set_torture_wait(int duration) +{ + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0) + WRITE_ONCE(sleep_duration, duration); +} +EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait); + +/* Actually implement the aforementioned wait. */ +static void rcu_gp_torture_wait(void) +{ + unsigned long duration; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST)) + return; + duration = xchg(&sleep_duration, 0UL); + if (duration > 0) { + pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); + schedule_timeout_idle(duration); + pr_alert("%s: Wait complete\n", __func__); + } +} + +/* + * Handler for on_each_cpu() to invoke the target CPU's RCU core + * processing. + */ +static void rcu_strict_gp_boundary(void *unused) +{ + invoke_rcu_core(); +} + +// Make the polled API aware of the beginning of a grace period. +static void rcu_poll_gp_seq_start(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) + raw_lockdep_assert_held_rcu_node(rnp); + + // If RCU was idle, note beginning of GP. + if (!rcu_seq_state(rcu_state.gp_seq_polled)) + rcu_seq_start(&rcu_state.gp_seq_polled); + + // Either way, record current state. + *snap = rcu_state.gp_seq_polled; +} + +// Make the polled API aware of the end of a grace period. +static void rcu_poll_gp_seq_end(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) + raw_lockdep_assert_held_rcu_node(rnp); + + // If the previously noted GP is still in effect, record the + // end of that GP. Either way, zero counter to avoid counter-wrap + // problems. + if (*snap && *snap == rcu_state.gp_seq_polled) { + rcu_seq_end(&rcu_state.gp_seq_polled); + rcu_state.gp_seq_polled_snap = 0; + rcu_state.gp_seq_polled_exp_snap = 0; + } else { + *snap = 0; + } +} + +// Make the polled API aware of the beginning of a grace period, but +// where caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) +{ + unsigned long flags; + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) + lockdep_assert_irqs_enabled(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } + rcu_poll_gp_seq_start(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +// Make the polled API aware of the end of a grace period, but where +// caller does not hold the root rcu_node structure's lock. 
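+// (For reference, these hooks back the user-facing polled API. A caller
+// typically does: cookie = start_poll_synchronize_rcu(); ...;
+// if (poll_state_synchronize_rcu(cookie)) { /* GP elapsed */ }.
+// Illustrative sketch only, not code from this patch.)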
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root();
+
+	if (rcu_init_invoked()) {
+		if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+			lockdep_assert_irqs_enabled();
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+	}
+	rcu_poll_gp_seq_end(snap);
+	if (rcu_init_invoked())
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+/*
+ * There is a single llist, which is used for handling
+ * synchronize_rcu() users' enqueued rcu_synchronize nodes.
+ * Within this llist, there are two tail pointers:
+ *
+ * wait tail: Tracks the set of nodes that need to
+ *            wait for the current GP to complete.
+ * done tail: Tracks the set of nodes whose grace
+ *            period has elapsed. These nodes will be
+ *            processed as part of the cleanup work
+ *            executed by a kworker.
+ *
+ * At every grace-period init, a new wait node is added
+ * to the llist and used as the wait tail for the new
+ * grace period. There are a fixed number of wait nodes,
+ * so they can all be in use (which can happen when
+ * kworker callback processing is delayed) when an
+ * additional grace period is requested. When that
+ * happens, the system is slow in processing callbacks.
+ *
+ * TODO: If slow processing is detected, the first node
+ * in the llist should be used as the wait tail for this
+ * grace period, so that users delayed by the slow
+ * processing are handled by _this_ grace period and not
+ * the next one.
+ *
+ * Below is an illustration of how the done and wait
+ * tail pointers move from one set of rcu_synchronize nodes
+ * to the other, as grace periods start and finish and
+ * nodes are processed by the kworker.
+ *
+ *
+ * a. Initial llist callbacks list:
+ *
+ *    +----------+        +--------+        +-------+
+ *    |          |        |        |        |       |
+ *    |   head   |------->|  cb2   |------->|  cb1  |
+ *    |          |        |        |        |       |
+ *    +----------+        +--------+        +-------+
+ *
+ *
+ * b. New GP1 Start:
+ *
+ *               WAIT TAIL
+ *                 |
+ *                 |
+ *                 v
+ *    +----------+     +--------+      +--------+      +-------+
+ *    |          |     |        |      |        |      |       |
+ *    |   head   ------> wait   |------>  cb2   |----->|  cb1  |
+ *    |          |     | head1  |      |        |      |       |
+ *    +----------+     +--------+      +--------+      +-------+
+ *
+ *
+ * c. GP completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ *
+ *               DONE TAIL
+ *                 |
+ *                 |
+ *                 v
+ *    +----------+     +--------+      +--------+      +-------+
+ *    |          |     |        |      |        |      |       |
+ *    |   head   ------> wait   |------>  cb2   |----->|  cb1  |
+ *    |          |     | head1  |      |        |      |       |
+ *    +----------+     +--------+      +--------+      +-------+
+ *
+ *
+ * d. New callbacks and GP2 start:
+ *
+ *                  WAIT TAIL                         DONE TAIL
+ *                    |                                  |
+ *                    |                                  |
+ *                    v                                  v
+ *    +----------+  +------+    +------+    +------+    +-----+    +-----+    +-----+
+ *    |          |  |      |    |      |    |      |    |     |    |     |    |     |
+ *    |   head   ---> wait |--->|  cb4 |--->| cb3  |--->|wait |--->|  cb2|--->|  cb1|
+ *    |          |  | head2|    |      |    |      |    |head1|    |     |    |     |
+ *    +----------+  +------+    +------+    +------+    +-----+    +-----+    +-----+
+ *
+ *
+ * e. GP2 completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ *                  DONE TAIL
+ *                    |
+ *                    |
+ *                    v
+ *    +----------+  +------+    +------+    +------+    +-----+    +-----+    +-----+
+ *    |          |  |      |    |      |    |      |    |     |    |     |    |     |
+ *    |   head   ---> wait |--->|  cb4 |--->| cb3  |--->|wait |--->|  cb2|--->|  cb1|
+ *    |          |  | head2|    |      |    |      |    |head1|    |     |    |     |
+ *    +----------+  +------+    +------+    +------+    +-----+    +-----+    +-----+
+ *
+ *
+ * While the llist state transitions from d to e, a kworker
+ * can start executing rcu_sr_normal_gp_cleanup_work() and
+ * can observe either the old done tail (@c) or the new
+ * done tail (@e). So, done-tail updates and reads need
+ * to use the rel-acq semantics.
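+ * (Concretely, in the code below, the smp_store_release() of
+ * rcu_state.srs_done_tail in rcu_sr_normal_gp_cleanup() pairs with
+ * the smp_load_acquire() of it in rcu_sr_normal_gp_cleanup_work().)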
+ * If the concurrent kworker
+ * observes the old done tail, the newly queued work
+ * execution will process the updated done tail. If the
+ * concurrent kworker observes the new done tail, then
+ * the newly queued work will skip processing the done
+ * tail, as workqueue semantics guarantee that the new
+ * work is executed only after the previous one completes.
+ *
+ * f. kworker callbacks processing complete:
+ *
+ *
+ *               DONE TAIL
+ *                 |
+ *                 |
+ *                 v
+ *    +----------+     +--------+
+ *    |          |     |        |
+ *    |   head   ------> wait   |
+ *    |          |     | head2  |
+ *    +----------+     +--------+
+ *
+ */
+static bool rcu_sr_is_wait_head(struct llist_node *node)
+{
+	return &(rcu_state.srs_wait_nodes)[0].node <= node &&
+		node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node;
+}
+
+static struct llist_node *rcu_sr_get_wait_head(void)
+{
+	struct sr_wait_node *sr_wn;
+	int i;
+
+	for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) {
+		sr_wn = &(rcu_state.srs_wait_nodes)[i];
+
+		if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1))
+			return &sr_wn->node;
+	}
+
+	return NULL;
+}
+
+static void rcu_sr_put_wait_head(struct llist_node *node)
+{
+	struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node);
+
+	atomic_set_release(&sr_wn->inuse, 0);
+}
+
+/* Enable rcu_normal_wake_from_gp automatically on small systems. */
+#define WAKE_FROM_GP_CPU_THRESHOLD 16
+
+static int rcu_normal_wake_from_gp = -1;
+module_param(rcu_normal_wake_from_gp, int, 0644);
+static struct workqueue_struct *sync_wq;
+
+static void rcu_sr_normal_complete(struct llist_node *node)
+{
+	struct rcu_synchronize *rs = container_of(
+		(struct rcu_head *) node, struct rcu_synchronize, head);
+
+	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+		  !poll_state_synchronize_rcu_full(&rs->oldstate),
+		  "A full grace period has not passed yet!\n");
+
+	/* Finally. */
+	complete(&rs->completion);
+}
+
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
+{
+	struct llist_node *done, *rcu, *next, *head;
+
+	/*
+	 * This work can execute while the grace-period kthread is
+	 * updating a new done tail in rcu_sr_normal_gp_cleanup().
+	 * Reads and updates of the done tail therefore need to
+	 * follow acquire-release semantics.
+	 *
+	 * Given that workqueue semantics guarantee that a single work
+	 * item cannot be executed concurrently by multiple kworkers,
+	 * the done-tail list manipulations are protected here.
+	 */
+	done = smp_load_acquire(&rcu_state.srs_done_tail);
+	if (WARN_ON_ONCE(!done))
+		return;
+
+	WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
+	head = done->next;
+	done->next = NULL;
+
+	/*
+	 * The dummy node pointed to by the acquire-read done tail
+	 * above is not removed here. This allows lockless additions
+	 * of new rcu_synchronize nodes in rcu_sr_normal_add_req()
+	 * while the cleanup work executes. The dummy node is removed
+	 * in the next round of cleanup-work execution.
+	 */
+	llist_for_each_safe(rcu, next, head) {
+		if (!rcu_sr_is_wait_head(rcu)) {
+			rcu_sr_normal_complete(rcu);
+			continue;
+		}
+
+		rcu_sr_put_wait_head(rcu);
+	}
+
+	/* Order list manipulations with atomic access. */
+	atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
+}
+
+/*
+ * Helper function for rcu_gp_cleanup().
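+ * Completes up to SR_MAX_USERS_WAKE_FROM_GP waiters directly, then
+ * hands anything left over to the cleanup kworker.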
+ */
+static void rcu_sr_normal_gp_cleanup(void)
+{
+	struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
+	int done = 0;
+
+	wait_tail = rcu_state.srs_wait_tail;
+	if (wait_tail == NULL)
+		return;
+
+	rcu_state.srs_wait_tail = NULL;
+	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+	WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail));
+
+	/*
+	 * Process cases (a) and (d). See the illustration above.
+	 */
+	llist_for_each_safe(rcu, next, wait_tail->next) {
+		if (rcu_sr_is_wait_head(rcu))
+			break;
+
+		rcu_sr_normal_complete(rcu);
+		// This node might be the last one, so keep
+		// wait_tail->next current at each step.
+		wait_tail->next = next;
+
+		if (++done == SR_MAX_USERS_WAKE_FROM_GP)
+			break;
+	}
+
+	/*
+	 * Fast path: no more users to process except putting the second-to-last
+	 * wait head, and then only if there are no in-flight workers. If there
+	 * are in-flight workers, they will remove the last wait head.
+	 *
+	 * Note that the ACQUIRE orders atomic access with list manipulation.
+	 */
+	if (wait_tail->next && wait_tail->next->next == NULL &&
+	    rcu_sr_is_wait_head(wait_tail->next) &&
+	    !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
+		rcu_sr_put_wait_head(wait_tail->next);
+		wait_tail->next = NULL;
+	}
+
+	/* Concurrent sr_normal_gp_cleanup work might observe this update. */
+	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
+	smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+
+	/*
+	 * Queue work to perform final processing of any outstanding
+	 * users (if any are left) and to release the wait heads added
+	 * by rcu_sr_normal_gp_init().
+	 */
+	if (wait_tail->next) {
+		atomic_inc(&rcu_state.srs_cleanups_pending);
+		if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
+			atomic_dec(&rcu_state.srs_cleanups_pending);
+	}
+}
+
+/*
+ * Helper function for rcu_gp_init().
+ */
+static bool rcu_sr_normal_gp_init(void)
+{
+	struct llist_node *first;
+	struct llist_node *wait_head;
+	bool start_new_poll = false;
+
+	first = READ_ONCE(rcu_state.srs_next.first);
+	if (!first || rcu_sr_is_wait_head(first))
+		return start_new_poll;
+
+	wait_head = rcu_sr_get_wait_head();
+	if (!wait_head) {
+		// Kick another GP to retry.
+		start_new_poll = true;
+		return start_new_poll;
+	}
+
+	/* Inject a wait-dummy-node. */
+	llist_add(wait_head, &rcu_state.srs_next);
+
+	/*
+	 * The wait list of rcu_synchronize nodes should be empty at
+	 * this step, because the GP kthread's rcu_gp_init() ->
+	 * rcu_gp_cleanup() cycle rolls it over. If not, that is a
+	 * bug, so warn the user.
+	 */
+	WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL);
+	rcu_state.srs_wait_tail = wait_head;
+	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+
+	return start_new_poll;
+}
+
+static void rcu_sr_normal_add_req(struct rcu_synchronize *rs)
+{
+	llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next);
 }
 
 /*
  * Initialize a new grace period. Return false if no grace period required.
  */
-static bool rcu_gp_init(void)
+static noinline_for_stack bool rcu_gp_init(void)
 {
 	unsigned long flags;
 	unsigned long oldmask;
 	unsigned long mask;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root();
+	bool start_new_poll;
+	unsigned long old_gp_seq;
 
 	WRITE_ONCE(rcu_state.gp_activity, jiffies);
 	raw_spin_lock_irq_rcu_node(rnp);
-	if (!READ_ONCE(rcu_state.gp_flags)) {
+	if (!rcu_state.gp_flags) {
 		/* Spurious wakeup, tell caller to go back to sleep.  */
 		raw_spin_unlock_irq_rcu_node(rnp);
 		return false;
@@ -1777,26 +1830,73 @@ static bool rcu_gp_init(void)
 
 	/* Advance to a new grace period and initialize state.
 */
 	record_gp_stall_check_time();
+	/*
+	 * A new wait segment must be started before gp_seq is advanced, so
+	 * that previous gp waiters won't observe the new gp_seq.
+	 */
+	start_new_poll = rcu_sr_normal_gp_init();
 	/* Record GP times before starting GP, hence rcu_seq_start(). */
+	old_gp_seq = rcu_state.gp_seq;
+	/*
+	 * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug
+	 * scan below. Otherwise we risk a race where a newly onlining CPU could
+	 * be missed by the current grace period, potentially leading to
+	 * use-after-free errors. For a detailed explanation of this race, see
+	 * Documentation/RCU/Design/Requirements/Requirements.rst in the
+	 * "Hotplug CPU" section.
+	 *
+	 * Also note that the root rnp's gp_seq is kept separate from, and lags,
+	 * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on
+	 * Single-node systems for more details (in Data-Structures.rst).
+	 */
 	rcu_seq_start(&rcu_state.gp_seq);
+	/* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+		     rcu_seq_done_exact(&old_gp_seq, rcu_seq_snap(&rcu_state.gp_seq)));
+
+	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
 	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+	rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
 	raw_spin_unlock_irq_rcu_node(rnp);
 
 	/*
-	 * Apply per-leaf buffered online and offline operations to the
-	 * rcu_node tree. Note that this new grace period need not wait
-	 * for subsequent online CPUs, and that quiescent-state forcing
-	 * will handle subsequent offline CPUs.
+	 * "start_new_poll" is set to true only when this GP is not able
+	 * to handle anything and there are outstanding users. That happens
+	 * when rcu_sr_normal_gp_init() was not able to insert a dummy
+	 * separator into the llist because no dummy nodes were left.
+	 *
+	 * The number of dummy nodes is fixed, so we can run out of them,
+	 * in which case a new poll request is started to retry. This is
+	 * rare, and it means that the system is slow in processing
+	 * callbacks.
 	 */
-	rcu_state.gp_state = RCU_GP_ONOFF;
+	if (start_new_poll)
+		(void) start_poll_synchronize_rcu();
+
+	/*
+	 * Apply per-leaf buffered online and offline operations to
+	 * the rcu_node tree. Note that this new grace period need not
+	 * wait for subsequent online CPUs, and that RCU hooks in the CPU
+	 * offlining path, when combined with checks in this function,
+	 * will handle CPUs that are currently going offline or that will
+	 * go offline later. Please also refer to "Hotplug CPU" section
+	 * of RCU's Requirements documentation.
	 */
+	WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+	/* Exclude CPU hotplug operations. */
 	rcu_for_each_leaf_node(rnp) {
-		raw_spin_lock(&rcu_state.ofl_lock);
-		raw_spin_lock_irq_rcu_node(rnp);
+		local_irq_disable();
+		/*
+		 * Serialize with CPU offline. See Requirements.rst > Hotplug CPU >
+		 * Concurrent Quiescent State Reporting for Offline CPUs.
+		 */
+		arch_spin_lock(&rcu_state.ofl_lock);
+		raw_spin_lock_rcu_node(rnp);
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
 		    !rnp->wait_blkd_tasks) {
 			/* Nothing to do on this leaf rcu_node structure.
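			 * (Its ->qsmaskinit already matches ->qsmaskinitnext,
			 * and it has no blocked tasks to wait on.)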
*/ - raw_spin_unlock_irq_rcu_node(rnp); - raw_spin_unlock(&rcu_state.ofl_lock); + raw_spin_unlock_rcu_node(rnp); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_enable(); continue; } @@ -1831,8 +1931,9 @@ static bool rcu_gp_init(void) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq_rcu_node(rnp); - raw_spin_unlock(&rcu_state.ofl_lock); + raw_spin_unlock_rcu_node(rnp); + arch_spin_unlock(&rcu_state.ofl_lock); + local_irq_enable(); } rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ @@ -1848,7 +1949,7 @@ static bool rcu_gp_init(void) * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ - rcu_state.gp_state = RCU_GP_INIT; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT); rcu_for_each_node_breadth_first(rnp) { rcu_gp_slow(gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1862,7 +1963,12 @@ static bool rcu_gp_init(void) trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - /* Quiescent states for tasks on any now-offline CPUs. */ + /* + * Quiescent states for tasks on any now-offline CPUs. Since we + * released the ofl and rnp lock before this loop, CPUs might + * have gone offline and we have to report QS on their behalf. + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting. + */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) @@ -1873,6 +1979,10 @@ static bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); } + // If strict, make all CPUs aware of new grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); + return true; } @@ -1884,12 +1994,16 @@ static bool rcu_gp_fqs_check_wake(int *gfp) { struct rcu_node *rnp = rcu_get_root(); - /* Someone like call_rcu() requested a force-quiescent-state scan. */ + // If under overload conditions, force an immediate FQS scan. + if (*gfp & RCU_GP_FLAG_OVLD) + return true; + + // Someone like call_rcu() requested a force-quiescent-state scan. *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; - /* The current grace period has completed. */ + // The current grace period has completed. if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) return true; @@ -1901,22 +2015,33 @@ static bool rcu_gp_fqs_check_wake(int *gfp) */ static void rcu_gp_fqs(bool first_time) { + int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall); struct rcu_node *rnp = rcu_get_root(); WRITE_ONCE(rcu_state.gp_activity, jiffies); - rcu_state.n_force_qs++; + WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1); + + WARN_ON_ONCE(nr_fqs > 3); + /* Only countdown nr_fqs for stall purposes if jiffies moves. */ + if (nr_fqs) { + if (nr_fqs == 1) { + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); + } + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs); + } + if (first_time) { /* Collect dyntick-idle snapshots. */ - force_qs_rnp(dyntick_save_progress_counter); + force_qs_rnp(rcu_watching_snap_save); } else { /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rcu_implicit_dynticks_qs); + force_qs_rnp(rcu_watching_snap_recheck); } /* Clear flag to prevent immediate re-entry. 
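	 * (rcu_force_quiescent_state() can set RCU_GP_FLAG_FQS again
	 * whenever another scan is needed.)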
*/ if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq_rcu_node(rnp); - WRITE_ONCE(rcu_state.gp_flags, - READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq_rcu_node(rnp); } } @@ -1924,45 +2049,66 @@ static void rcu_gp_fqs(bool first_time) /* * Loop doing repeated quiescent-state forcing until the grace period ends. */ -static void rcu_gp_fqs_loop(void) +static noinline_for_stack void rcu_gp_fqs_loop(void) { - bool first_gp_fqs; - int gf; + bool first_gp_fqs = true; + int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); - first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); + if (rcu_state.cbovld) + gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { - if (!ret) { - rcu_state.jiffies_force_qs = jiffies + j; + if (rcu_state.cbovld) { + j = (j + 2) / 3; + if (j <= 0) + j = 1; + } + if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) { + WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j); + /* + * jiffies_force_qs before RCU_GP_WAIT_FQS state + * update; required for stall checks. + */ + smp_wmb(); WRITE_ONCE(rcu_state.jiffies_kick_kthreads, - jiffies + 3 * j); + jiffies + (j ? 3 * j : 2)); } - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); - rcu_state.gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_idle_timeout_exclusive( - rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); - rcu_state.gp_state = RCU_GP_DOING_FQS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); + (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq, + rcu_gp_fqs_check_wake(&gf), j); + rcu_gp_torture_wait(); + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ - /* If grace period done, leave loop. */ + /* + * Exit the loop if the root rcu_node structure indicates that the grace period + * has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check + * is required only for single-node rcu_node trees because readers blocking + * the current grace period are queued only on leaf rcu_node structures. + * For multi-node trees, checking the root node's ->qsmask suffices, because a + * given root node's ->qsmask bit is cleared only when all CPUs and tasks from + * the corresponding leaf nodes have passed through their quiescent state. + */ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || - (gf & RCU_GP_FLAG_FQS)) { - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + if (!time_after(rcu_state.jiffies_force_qs, jiffies) || + (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) { + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); - first_gp_fqs = false; - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + gf = 0; + if (first_gp_fqs) { + first_gp_fqs = false; + gf = rcu_state.cbovld ? 
RCU_GP_FLAG_OVLD : 0; + } + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); @@ -1973,8 +2119,7 @@ static void rcu_gp_fqs_loop(void) cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswaitsig")); ret = 1; /* Keep old FQS timing. */ j = jiffies; @@ -1982,6 +2127,7 @@ static void rcu_gp_fqs_loop(void) j = 1; else j = rcu_state.jiffies_force_qs - j; + gf = 0; } } } @@ -1989,11 +2135,13 @@ static void rcu_gp_fqs_loop(void) /* * Clean up after the old grace period. */ -static void rcu_gp_cleanup(void) +static noinline void rcu_gp_cleanup(void) { - unsigned long gp_duration; + int cpu; bool needgp = false; + unsigned long gp_duration; unsigned long new_gp_seq; + bool offloaded; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(); struct swait_queue_head *sq; @@ -2013,6 +2161,7 @@ static void rcu_gp_cleanup(void) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -2032,11 +2181,19 @@ static void rcu_gp_cleanup(void) dump_blkd_tasks(rnp, 10); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->gp_seq, new_gp_seq); + if (!rnp->parent) + smp_mb(); // Order against failing poll_state_synchronize_rcu_full(). rdp = this_cpu_ptr(&rcu_data); if (rnp == rdp->mynode) needgp = __note_gp_changes(rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ needgp = rcu_future_gp_cleanup(rnp) || needgp; + // Reset overload indication for CPUs no longer overloaded + if (rcu_is_leaf_node(rnp)) + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + check_cb_ovld_locked(rdp, rnp); + } sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); @@ -2050,7 +2207,8 @@ static void rcu_gp_cleanup(void) /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); - rcu_state.gp_state = RCU_GP_IDLE; + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); + WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE); /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { @@ -2059,17 +2217,40 @@ static void rcu_gp_cleanup(void) needgp = true; } /* Advance CBs to reduce false positives below. */ - if (!rcu_accelerate_cbs(rnp, rdp) && needgp) { + offloaded = rcu_rdp_is_offloaded(rdp); + if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { + + // We get here if a grace period was needed (“needgp”) + // and the above call to rcu_accelerate_cbs() did not set + // the RCU_GP_FLAG_INIT bit in ->gp_state (which records + // the need for another grace period). The purpose + // of the “offloaded” check is to avoid invoking + // rcu_accelerate_cbs() on an offloaded CPU because we do not + // hold the ->nocb_lock needed to safely access an offloaded + // ->cblist. We do not want to acquire that lock because + // it can be heavily contended during callback floods. 
+ WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); - rcu_state.gp_req_activity = jiffies; - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), - TPS("newreq")); + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); } else { - WRITE_ONCE(rcu_state.gp_flags, - rcu_state.gp_flags & RCU_GP_FLAG_INIT); + + // We get here either if there is no need for an + // additional grace period or if rcu_accelerate_cbs() has + // already set the RCU_GP_FLAG_INIT bit in ->gp_flags. + // So all we need to do is to clear all of the other + // ->gp_flags bits. + + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); + + // Make synchronize_rcu() users aware of the end of old grace period. + rcu_sr_normal_gp_cleanup(); + + // If strict, make all CPUs aware of the end of the old grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); } /* @@ -2082,22 +2263,21 @@ static int __noreturn rcu_gp_kthread(void *unused) /* Handle grace-period start. */ for (;;) { - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwait")); - rcu_state.gp_state = RCU_GP_WAIT_GPS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS); swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); - rcu_state.gp_state = RCU_GP_DONE_GPS; + rcu_gp_torture_wait(); + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS); /* Locking provides needed memory barrier. */ if (rcu_gp_init()) break; cond_resched_tasks_rcu_qs(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WARN_ON(signal_pending(current)); - trace_rcu_grace_period(rcu_state.name, - READ_ONCE(rcu_state.gp_seq), + trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwaitsig")); } @@ -2105,9 +2285,9 @@ static int __noreturn rcu_gp_kthread(void *unused) rcu_gp_fqs_loop(); /* Handle grace-period end. */ - rcu_state.gp_state = RCU_GP_CLEANUP; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP); rcu_gp_cleanup(); - rcu_state.gp_state = RCU_GP_CLEANED; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED); } } @@ -2125,8 +2305,7 @@ static void rcu_report_qs_rsp(unsigned long flags) { raw_lockdep_assert_held_rcu_node(rcu_get_root()); WARN_ON_ONCE(!rcu_gp_in_progress()); - WRITE_ONCE(rcu_state.gp_flags, - READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags); rcu_gp_kthread_wake(); } @@ -2168,7 +2347,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! 
*/ WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); - rnp->qsmask &= ~mask; + WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask); trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq, mask, rnp->qsmask, rnp->level, rnp->grplo, rnp->grphi, @@ -2191,7 +2370,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); - oldmask = rnp_c->qsmask; + oldmask = READ_ONCE(rnp_c->qsmask); } /* @@ -2218,7 +2397,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2249,13 +2428,13 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * structure. This must be called from the specified CPU. */ static void -rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) +rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needwake; struct rcu_node *rnp; + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || @@ -2272,21 +2451,28 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = false; - /* * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. + * + * NOCB kthreads have their own way to deal with that... */ - needwake = rcu_accelerate_cbs(rnp, rdp); + if (!rcu_rdp_is_offloaded(rdp)) { + /* + * The current GP has not yet ended, so it + * should not be possible for rcu_accelerate_cbs() + * to return true. So complain, but don't awaken. + */ + WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp)); + } + rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ - if (needwake) - rcu_gp_kthread_wake(); } } @@ -2320,166 +2506,154 @@ rcu_check_quiescent_state(struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); } -/* - * Near the end of the offline process. Trace the fact that this CPU - * is going offline. - */ -int rcutree_dying_cpu(unsigned int cpu) +/* Return true if callback-invocation time limit exceeded. */ +static bool rcu_do_batch_check_time(long count, long tlimit, + bool jlimit_check, unsigned long jlimit) { - RCU_TRACE(bool blkd;) - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) - RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) - trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, - blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); - return 0; -} - -/* - * All CPUs for the specified rcu_node structure have gone offline, - * and all tasks that were preempted within an RCU read-side critical - * section while running on one of those CPUs have since exited their RCU - * read-side critical section. Some other CPU is reporting this fact with - * the specified rcu_node structure's ->lock held and interrupts disabled. 
- * This function therefore goes up the tree of rcu_node structures, - * clearing the corresponding bits in the ->qsmaskinit fields. Note that - * the leaf rcu_node structure's ->qsmaskinit field has already been - * updated. - * - * This function does check that the specified rcu_node structure has - * all CPUs offline and no blocked tasks, so it is OK to invoke it - * prematurely. That said, invoking it after the fact will cost you - * a needless lock acquisition. So once it has done its work, don't - * invoke it again. - */ -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) -{ - long mask; - struct rcu_node *rnp = rnp_leaf; - - raw_lockdep_assert_held_rcu_node(rnp_leaf); - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - WARN_ON_ONCE(rnp_leaf->qsmaskinit) || - WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) - return; - for (;;) { - mask = rnp->grpmask; - rnp = rnp->parent; - if (!rnp) - break; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rnp->qsmaskinit &= ~mask; - /* Between grace periods, so better already be zero! */ - WARN_ON_ONCE(rnp->qsmask); - if (rnp->qsmaskinit) { - raw_spin_unlock_rcu_node(rnp); - /* irqs remain disabled. */ - return; - } - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - } -} - -/* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup. - * There can only be one CPU hotplug operation at a time, so no need for - * explicit locking. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - /* Adjust any no-longer-needed kthreads. */ - rcu_boost_kthread_setaffinity(rnp, -1); - /* Do any needed no-CB deferred wakeups from this CPU. */ - do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); - return 0; + // Invoke local_clock() only once per 32 consecutive callbacks. + return unlikely(tlimit) && + (!likely(count & 31) || + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) && + jlimit_check && time_after(jiffies, jlimit))) && + local_clock() >= tlimit; } /* * Invoke any RCU callbacks that have made it to the end of their grace - * period. Thottle as specified by rdp->blimit. + * period. Throttle as specified by rdp->blimit. */ static void rcu_do_batch(struct rcu_data *rdp) { + long bl; + long count = 0; + int div; + bool __maybe_unused empty; unsigned long flags; - struct rcu_head *rhp; + unsigned long jlimit; + bool jlimit_check = false; + long pending; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count; + struct rcu_head *rhp; + long tlimit = 0; /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), 0); trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread()); + rcu_is_callbacks_kthread(rdp)); return; } /* - * Extract the list of ready callbacks, disabling to prevent + * Extract the list of ready callbacks, disabling IRQs to prevent * races with call_rcu() from interrupt handlers. Leave the * callback counts, as rcu_barrier() needs to be conservative. 
+ * + * Callbacks execution is fully ordered against preceding grace period + * completion (materialized by rnp->gp_seq update) thanks to the + * smp_mb__after_unlock_lock() upon node locking required for callbacks + * advancing. In NOCB mode this ordering is then further relayed through + * the nocb locking that protects both callbacks advancing and extraction. */ - local_irq_save(flags); + rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); - bl = rdp->blimit; + pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL); + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); + if ((in_serving_softirq() || rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) && + (IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) || unlikely(bl > 100))) { + const long npj = NSEC_PER_SEC / HZ; + long rrn = READ_ONCE(rcu_resched_ns); + + rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; + tlimit = local_clock() + rrn; + jlimit = jiffies + (rrn + npj + 1) / npj; + jlimit_check = true; + } trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); - local_irq_restore(flags); + if (rcu_rdp_is_offloaded(rdp)) + rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); + + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued")); + rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. */ + tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); + for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + rcu_callback_t f; + + count++; debug_rcu_head_unqueue(rhp); - if (__rcu_reclaim(rcu_state.name, rhp)) - rcu_cblist_dequeued_lazy(&rcl); + + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_callback(rcu_state.name, rhp); + + f = rhp->func; + debug_rcu_head_callback(rhp); + WRITE_ONCE(rhp->func, (rcu_callback_t)0L); + f(rhp); + + rcu_lock_release(&rcu_callback_map); + /* * Stop only if limit reached and CPU has something to do. - * Note: The rcl structure counts down from zero. */ - if (-rcl.len >= bl && - (need_resched() || - (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) - break; + if (in_serving_softirq()) { + if (count >= bl && (need_resched() || !is_idle_task(current))) + break; + /* + * Make sure we don't spend too much time here and deprive other + * softirq vectors of CPU cycles. + */ + if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) + break; + } else { + // In rcuc/rcuoc context, so no worries about + // depriving other softirq vectors of CPU cycles. + local_bh_enable(); + lockdep_assert_irqs_enabled(); + cond_resched_tasks_rcu_qs(); + lockdep_assert_irqs_enabled(); + local_bh_disable(); + // But rcuc kthreads can delay quiescent-state + // reporting, so check time limits for them. + if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING && + rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) { + rdp->rcu_cpu_has_work = 1; + break; + } + } } - local_irq_save(flags); - count = -rcl.len; + rcu_nocb_lock_irqsave(rdp, flags); + rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), - is_idle_task(current), rcu_is_callbacks_kthread()); + is_idle_task(current), rcu_is_callbacks_kthread(rdp)); /* Update counts and requeue any remaining callbacks. 
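	 * (Callbacks not invoked in this pass go back onto the ->cblist
	 * done segment and are picked up by a later invocation.)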
*/ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); - smp_mb(); /* List handling before counting for rcu_barrier(). */ - rcu_segcblist_insert_count(&rdp->cblist, &rcl); + rcu_segcblist_add_len(&rdp->cblist, -count); /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); - if (rdp->blimit == LONG_MAX && count <= qlowmark) + if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ if (count == 0 && rdp->qlen_last_fqs_check != 0) { rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rcu_state.n_force_qs; + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); } else if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; @@ -2487,64 +2661,79 @@ static void rcu_do_batch(struct rcu_data *rdp) * The following usually indicates a double call_rcu(). To track * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. */ - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); + empty = rcu_segcblist_empty(&rdp->cblist); + WARN_ON_ONCE(count == 0 && !empty); + WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + count != 0 && empty); + WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0); + WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0); - local_irq_restore(flags); + rcu_nocb_unlock_irqrestore(rdp, flags); - /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_core(); + tick_dep_clear_task(current, TICK_DEP_BIT_RCU); } /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. + * This function is invoked from each scheduling-clock interrupt, + * and checks to see if this CPU is in a non-context-switch quiescent + * state, for example, user mode or idle loop. It also schedules RCU + * core processing. If the current grace period has gone on too long, + * it will ask the scheduler to manufacture a context switch for the sole + * purpose of providing the needed quiescent state. */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user) { + unsigned long j; + + if (IS_ENABLED(CONFIG_PROVE_RCU)) { + j = jiffies; + WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock))); + __this_cpu_write(rcu_data.last_sched_clock, j); + } trace_rcu_utilization(TPS("Start scheduler-tick")); + lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { /* Idle and userspace execution already are quiescent states. 
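		 * (So the quiescent-state-forcing resched below is needed
		 * only when this tick interrupted kernel code.)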
*/ - if (!rcu_is_cpu_rrupt_from_idle() && !user) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + if (!rcu_is_cpu_rrupt_from_idle() && !user) + set_need_resched_current(); __this_cpu_write(rcu_data.rcu_urgent_qs, false); } - rcu_flavor_check_callbacks(user); - if (rcu_pending()) + rcu_flavor_sched_clock_irq(user); + if (rcu_pending(user)) invoke_rcu_core(); + if (user || rcu_is_cpu_rrupt_from_idle()) + rcu_note_voluntary_context_switch(current); + lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); } /* - * Scan the leaf rcu_node structures, processing dyntick state for any that - * have not yet encountered a quiescent state, using the function specified. - * Also initiate boosting for any threads blocked on the root rcu_node. - * - * The caller must have suppressed start of new grace periods. + * Scan the leaf rcu_node structures. For each structure on which all + * CPUs have reported a quiescent state and on which there are tasks + * blocking the current grace period, initiate RCU priority boosting. + * Otherwise, invoke the specified function to check dyntick state for + * each CPU that has not yet reported a quiescent state. */ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { int cpu; unsigned long flags; - unsigned long mask; struct rcu_node *rnp; + rcu_state.cbovld = rcu_state.cbovldnext; + rcu_state.cbovldnext = false; rcu_for_each_leaf_node(rnp) { + unsigned long mask = 0; + unsigned long rsmask = 0; + cond_resched_tasks_rcu_qs(); - mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT) || - rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they * are all zero. But we might need to @@ -2557,12 +2746,18 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } - for_each_leaf_node_possible_cpu(rnp, cpu) { - unsigned long bit = leaf_node_cpu_bit(rnp, cpu); - if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(&rcu_data, cpu))) - mask |= bit; + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + struct rcu_data *rdp; + int ret; + + rdp = per_cpu_ptr(&rcu_data, cpu); + ret = f(rdp); + if (ret > 0) { + mask |= rdp->grpmask; + rcu_disable_urgency_upon_qs(rdp); } + if (ret < 0) + rsmask |= rdp->grpmask; } if (mask != 0) { /* Idle/offline CPUs, report (releases rnp->lock). */ @@ -2571,6 +2766,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + + for_each_leaf_node_cpu_mask(rnp, cpu, rsmask) + resched_cpu(cpu); } } @@ -2578,18 +2776,20 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(void) +void rcu_force_quiescent_state(void) { unsigned long flags; bool ret; struct rcu_node *rnp; struct rcu_node *rnp_old = NULL; + if (!rcu_gp_in_progress()) + return; /* Funnel through hierarchy to reduce memory contention. 
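	 * (CPUs that fail the ->fqslock trylock at any level simply give
	 * up, so at most a few contenders ever reach the root-level locks.)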
*/ - rnp = __this_cpu_read(rcu_data.mynode); + rnp = raw_cpu_read(rcu_data.mynode); for (; rnp != NULL; rnp = rnp->parent) { ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) || - !raw_spin_trylock(&rnp->fqslock); + !raw_spin_trylock(&rnp->fqslock); if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) @@ -2605,120 +2805,23 @@ static void force_quiescent_state(void) raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } - WRITE_ONCE(rcu_state.gp_flags, - READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); + WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(); } +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); -/* - * This function checks for grace-period requests that fail to motivate - * RCU to come out of its idle mode. - */ -void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, - const unsigned long gpssdelay) -{ - unsigned long flags; - unsigned long j; - struct rcu_node *rnp_root = rcu_get_root(); - static atomic_t warned = ATOMIC_INIT(0); - - if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) - return; - j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) - return; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - /* Hold onto the leaf lock to make others see warned==1. */ - - if (rnp_root != rnp) - raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rcu_state.gp_req_activity + gpssdelay) || - time_before(j, rcu_state.gp_activity + gpssdelay) || - atomic_xchg(&warned, 1)) { - raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", - __func__, (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rnp_root->gp_seq_needed), - j - rcu_state.gp_req_activity, j - rcu_state.gp_activity, - rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL); - WARN_ON(1); - if (rnp_root != rnp) - raw_spin_unlock_rcu_node(rnp_root); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Do a forward-progress check for rcutorture. This is normally invoked - * due to an OOM event. The argument "j" gives the time period during - * which rcutorture would like progress to have been made. - */ -void rcu_fwd_progress_check(unsigned long j) +// Workqueue handler for an RCU reader for kernels enforcing struct RCU +// grace periods. 
+static void strict_work_handler(struct work_struct *work) { - unsigned long cbs; - int cpu; - unsigned long max_cbs = 0; - int max_cpu = -1; - struct rcu_data *rdp; - - if (rcu_gp_in_progress()) { - pr_info("%s: GP age %lu jiffies\n", - __func__, jiffies - rcu_state.gp_start); - show_rcu_gp_kthreads(); - } else { - pr_info("%s: Last GP end %lu jiffies ago\n", - __func__, jiffies - rcu_state.gp_end); - preempt_disable(); - rdp = this_cpu_ptr(&rcu_data); - rcu_check_gp_start_stall(rdp->mynode, rdp, j); - preempt_enable(); - } - for_each_possible_cpu(cpu) { - cbs = rcu_get_n_cbs_cpu(cpu); - if (!cbs) - continue; - if (max_cpu < 0) - pr_info("%s: callbacks", __func__); - pr_cont(" %d: %lu", cpu, cbs); - if (cbs <= max_cbs) - continue; - max_cbs = cbs; - max_cpu = cpu; - } - if (max_cpu >= 0) - pr_cont("\n"); + rcu_read_lock(); + rcu_read_unlock(); } -EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); -/* - * This does the RCU core processing work for the specified rcu_data - * structures. This may be called only from the CPU to whom the rdp - * belongs. - */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +/* Perform RCU core processing work for the current CPU. */ +static __latent_entropy void rcu_core(void) { - unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -2728,11 +2831,11 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused WARN_ON_ONCE(!rdp->beenonline); /* Report any deferred quiescent states if preemption enabled. */ - if (!(preempt_count() & PREEMPT_MASK)) { + if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { - set_tsk_need_resched(current); - set_preempt_need_resched(); + guard(irqsave)(); + set_need_resched_current(); } /* Update RCU state based on any recent quiescent states. */ @@ -2740,54 +2843,161 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist)) { - local_irq_save(flags); + rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { + guard(irqsave)(); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_callbacks(rdp); + if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) && + likely(READ_ONCE(rcu_scheduler_fully_active))) { + rcu_do_batch(rdp); + /* Re-invoke RCU core processing if there are callbacks remaining. */ + if (rcu_segcblist_ready_cbs(&rdp->cblist)) + invoke_rcu_core(); + } /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); + + // If strict GPs, schedule an RCU reader in a clean environment. 
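+	// (The handler, strict_work_handler() above, simply enters and
+	// exits an RCU read-side critical section from workqueue context.)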
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); +} + +static void rcu_core_si(void) +{ + rcu_core(); +} + +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) + wake_up_process(t); +} + +static void invoke_rcu_core_kthread(void) +{ + struct task_struct *t; + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); + t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task); + if (t != NULL && t != current) + rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); + local_irq_restore(flags); } /* - * Schedule RCU callback invocation. If the running implementation of RCU - * does not support RCU priority boosting, just do a direct call, otherwise - * wake up the per-CPU kernel kthread. Note that because we are running - * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task - * cannot disappear out from under us. + * Wake up this CPU's rcuc kthread to do RCU core processing. */ -static void invoke_rcu_callbacks(struct rcu_data *rdp) +static void invoke_rcu_core(void) { - if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) - return; - if (likely(!rcu_state.boost)) { - rcu_do_batch(rdp); + if (!cpu_online(smp_processor_id())) return; + if (use_softirq) + raise_softirq(RCU_SOFTIRQ); + else + invoke_rcu_core_kthread(); +} + +static void rcu_cpu_kthread_park(unsigned int cpu) +{ + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __this_cpu_read(rcu_data.rcu_cpu_has_work); +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces + * the RCU softirq used in configurations of RCU that do not support RCU + * priority boosting. + */ +static void rcu_cpu_kthread(unsigned int cpu) +{ + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); + unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity); + int spincnt; + + trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); + for (spincnt = 0; spincnt < 10; spincnt++) { + WRITE_ONCE(*j, jiffies); + local_bh_disable(); + *statusp = RCU_KTHREAD_RUNNING; + local_irq_disable(); + work = *workp; + WRITE_ONCE(*workp, 0); + local_irq_enable(); + if (work) + rcu_core(); + local_bh_enable(); + if (!READ_ONCE(*workp)) { + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); + *statusp = RCU_KTHREAD_WAITING; + return; + } } - invoke_rcu_callbacks_kthread(); + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); + schedule_timeout_idle(2); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); + *statusp = RCU_KTHREAD_WAITING; + WRITE_ONCE(*j, jiffies); } -static void invoke_rcu_core(void) +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_data.rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + +/* + * Spawn per-CPU RCU core processing kthreads. 
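+ * (These rcuc kthreads are used only when use_softirq is clear;
+ * otherwise RCU core work runs from RCU_SOFTIRQ, as chosen in
+ * invoke_rcu_core() above.)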
+ */ +static int __init rcu_spawn_core_kthreads(void) { - if (cpu_online(smp_processor_id())) - raise_softirq(RCU_SOFTIRQ); + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; + if (use_softirq) + return 0; + WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), + "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); + return 0; +} + +static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) +{ + rcu_segcblist_enqueue(&rdp->cblist, head); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); } /* * Handle any core-RCU processing required by a call_rcu() invocation. */ -static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, - unsigned long flags) +static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags) { + rcutree_enqueue(rdp, head, func); /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2801,9 +3011,9 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, /* * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() + * Enforce hysteresis, and don't invoke rcu_force_quiescent_state() * if some other CPU has recently done so. Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback + * invoking rcu_force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > @@ -2817,11 +3027,11 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); } else { /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; - if (rcu_state.n_force_qs == rdp->n_force_qs_snap && + rdp->blimit = DEFAULT_MAX_RCU_BLIMIT; + if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) - force_quiescent_state(); - rdp->n_force_qs_snap = rcu_state.n_force_qs; + rcu_force_quiescent_state(); + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } } @@ -2835,80 +3045,144 @@ static void rcu_leak_callback(struct rcu_head *rhp) } /* - * Helper function for call_rcu() and friends. The cpu argument will - * normally be -1, indicating "currently running CPU". It may specify - * a CPU only if that CPU is a no-CBs CPU. Currently, only rcu_barrier() - * is expected to specify a CPU. + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. The caller must hold the leaf rcu_node + * structure's ->lock. + */ +static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp) +{ + raw_lockdep_assert_held_rcu_node(rnp); + if (qovld_calc <= 0) + return; // Early boot and wildcard value set. + if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask); + else + WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask); +} + +/* + * Check and if necessary update the leaf rcu_node structure's + * ->cbovldmask bit corresponding to the current CPU based on that CPU's + * number of queued RCU callbacks. 
No locks need be held, but the + * caller must have disabled interrupts. + * + * Note that this function ignores the possibility that there are a lot + * of callbacks all of which have already seen the end of their respective + * grace periods. This omission is due to the need for no-CBs CPUs to + * be holding ->nocb_lock to do this check, which is too heavy for a + * common-case operation. */ +static void check_cb_ovld(struct rcu_data *rdp) +{ + struct rcu_node *const rnp = rdp->mynode; + + if (qovld_calc <= 0 || + ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) == + !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask))) + return; // Early boot wildcard value or already set correctly. + raw_spin_lock_rcu_node(rnp); + check_cb_ovld_locked(rdp, rnp); + raw_spin_unlock_rcu_node(rnp); +} + static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) { + static atomic_t doublefrees; unsigned long flags; + bool lazy; struct rcu_data *rdp; /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + /* Avoid NULL dereference if callback is NULL. */ + if (WARN_ON_ONCE(!func)) + return; + if (debug_rcu_head_queue(head)) { /* * Probable double call_rcu(), so leak the callback. * Use rcu:rcu_callback trace event to find the previous - * time callback was passed to __call_rcu(). + * time callback was passed to call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", - head, head->func); + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } WRITE_ONCE(head->func, rcu_leak_callback); return; } head->func = func; head->next = NULL; + kasan_record_aux_stack(head); + local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); + RCU_LOCKDEP_WARN(!rcu_rdp_cpu_online(rdp), "Callback enqueued on offline CPU!"); - /* Add the callback to our list. */ - if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { - int offline; + lazy = lazy_in && !rcu_async_should_hurry(); - if (cpu != -1) - rdp = per_cpu_ptr(&rcu_data, cpu); - if (likely(rdp->mynode)) { - /* Post-boot, so this should be for a no-CBs CPU. */ - offline = !__call_rcu_nocb(rdp, head, lazy, flags); - WARN_ON_ONCE(offline); - /* Offline CPU, _call_rcu() illegal, leak callback. */ - local_irq_restore(flags); - return; - } - /* - * Very early boot, before rcu_init(). Initialize if needed - * and then drop through to queue the callback. - */ - WARN_ON_ONCE(cpu != -1); + /* Add the callback to our list. */ + if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { + // This can trigger due to call_rcu() from offline CPU: + WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE); WARN_ON_ONCE(!rcu_is_watching()); + // Very early boot, before rcu_init(). Initialize if needed + // and then drop through to queue the callback. if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } - rcu_segcblist_enqueue(&rdp->cblist, head, lazy); - if (!lazy) - rcu_idle_count_callbacks_posted(); - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), - rcu_segcblist_n_cbs(&rdp->cblist)); + check_cb_ovld(rdp); - /* Go handle any RCU core processing required. 
*/ - __call_rcu_core(rdp, head, flags); + if (unlikely(rcu_rdp_is_offloaded(rdp))) + call_rcu_nocb(rdp, head, func, flags, lazy); + else + call_rcu_core(rdp, head, func, flags); local_irq_restore(flags); } +#ifdef CONFIG_RCU_LAZY +static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF); +module_param(enable_rcu_lazy, bool, 0444); + +/** + * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and + * flush all lazy callbacks (including the new one) to the main ->cblist while + * doing so. + * + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. + * + * Use this API instead of call_rcu() if you don't want the callback to be + * delayed for very long periods of time, which can happen on systems without + * memory pressure and on systems which are lightly loaded or mostly idle. + * This function will cause callbacks to be invoked sooner than later at the + * expense of extra power. Other than that, this function is identical to, and + * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory + * ordering and other functionality. + */ +void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) +{ + __call_rcu_common(head, func, false); +} +EXPORT_SYMBOL_GPL(call_rcu_hurry); +#else +#define enable_rcu_lazy false +#endif + /** * call_rcu() - Queue an RCU callback for invocation after a grace period. + * By default the callbacks are 'lazy' and are kept hidden from the main + * ->cblist to prevent starting of grace periods too soon. + * If you desire grace periods to start very soon, use call_rcu_hurry(). + * * @head: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period * @@ -2916,12 +3190,20 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * period elapses, in other words after all pre-existing RCU read-side * critical sections have completed. However, the callback function * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and - * may be nested. In addition, regions of code across which interrupts, - * preemption, or softirqs have been disabled also serve as RCU read-side - * critical sections. This includes hardware interrupt handlers, softirq - * handlers, and NMI handlers. + * that started after call_rcu() was invoked. + * + * It is perfectly legal to repost an RCU callback, potentially with + * a different callback function, from within its callback function. + * The specified function will be invoked after another full grace period + * has elapsed. This use case is similar in form to the common practice + * of reposting a timer from within its own handler. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. 
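To make the lazy-versus-hurry trade-off concrete, here is a hedged sketch of choosing between the two enqueue paths just defined; struct demo_conn and its helpers are illustrative assumptions, not patch content.

struct demo_conn {
        struct hlist_node node;
        struct rcu_head rh;
};

static void demo_conn_free_cb(struct rcu_head *rh)
{
        kfree(container_of(rh, struct demo_conn, rh));
}

static void demo_conn_close(struct demo_conn *conn, bool reclaim_fast)
{
        hlist_del_rcu(&conn->node);
        if (reclaim_fast)
                call_rcu_hurry(&conn->rh, demo_conn_free_cb); /* prompt reclaim */
        else
                call_rcu(&conn->rh, demo_conn_free_cb); /* may batch lazily */
}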
* * Note that all CPUs must agree that the grace period extended beyond * all pre-existing RCU read-side critical section. On systems with more @@ -2941,32 +3223,177 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * between the call to call_rcu() and the invocation of "func()" -- even * if CPU A and CPU B are the same CPU (but again only if the system has * more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + * + * Specific to call_rcu() (as opposed to the other call_rcu*() functions), + * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many + * seconds before starting the grace period needed by the corresponding + * callback. This delay can significantly improve energy-efficiency + * on low-utilization battery-powered devices. To avoid this delay, + * in latency-sensitive kernel code, use call_rcu_hurry(). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, -1, 0); + __call_rcu_common(head, func, enable_rcu_lazy); } EXPORT_SYMBOL_GPL(call_rcu); /* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). + * During early boot, any blocking grace-period wait automatically + * implies a grace period. + * + * Later on, this could in theory be the case for kernels built with + * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this + * is not a common case. Furthermore, this optimization would cause + * the rcu_gp_oldstate structure to expand by 50%, so this potential + * grace-period optimization is ignored once the scheduler is running. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static int rcu_blocking_is_gp(void) { - __call_rcu(head, func, -1, 1); + if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) { + might_sleep(); + return false; + } + return true; +} + +/* + * Helper function for the synchronize_rcu() API. + */ +static void synchronize_rcu_normal(void) +{ + struct rcu_synchronize rs; + + trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request")); + + if (READ_ONCE(rcu_normal_wake_from_gp) < 1) { + wait_rcu_gp(call_rcu_hurry); + goto trace_complete_out; + } + + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); + + /* + * This code might be preempted, therefore take a GP + * snapshot before adding a request. + */ + if (IS_ENABLED(CONFIG_PROVE_RCU)) + get_state_synchronize_rcu_full(&rs.oldstate); + + rcu_sr_normal_add_req(&rs); + + /* Kick a GP and start waiting. */ + (void) start_poll_synchronize_rcu(); + + /* Now we can wait. */ + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); + +trace_complete_out: + trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete")); } -EXPORT_SYMBOL_GPL(kfree_call_rcu); + +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. 
Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. + * + * RCU read-side critical sections are delimited by rcu_read_lock() + * and rcu_read_unlock(), and may be nested. In addition, but only in + * v5.0 and later, regions of code across which interrupts, preemption, + * or softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last RCU read-side critical section whose beginning + * preceded the call to synchronize_rcu(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_rcu() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_rcu() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + * + * Implementation of these memory-ordering guarantees is described here: + * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + */ +void synchronize_rcu(void) +{ + unsigned long flags; + struct rcu_node *rnp; + + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + if (!rcu_blocking_is_gp()) { + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + synchronize_rcu_normal(); + return; + } + + // Context allows vacuous grace periods. + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs at + // process level. Therefore, this normal GP overlaps with other + // normal GPs only by being fully nested within them, which allows + // reuse of ->gp_seq_polled_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + + // Update the normal grace-period counters to record + // this grace period, but only those used by the boot CPU. + // The rcu_scheduler_starting() will take care of the rest of + // these counters. + local_irq_save(flags); + WARN_ON_ONCE(num_online_cpus() > 1); + rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT); + for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent) + rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +/** + * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie + * @rgosp: Place to put state cookie + * + * Stores into @rgosp a value that will always be treated by functions + * like poll_state_synchronize_rcu_full() as a cookie whose grace period + * has already completed. 
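The guarantee being documented here is what makes the classic unlink, wait, then free update pattern safe. A minimal sketch under assumed names (demo_global_ptr, demo_update()) and the assumption of a single updater:

struct demo_state {
        int a;
};

static struct demo_state __rcu *demo_global_ptr;

static void demo_update(struct demo_state *new_state)
{
        struct demo_state *old;

        /* Assumes a single updater; otherwise hold an update-side lock. */
        old = rcu_dereference_protected(demo_global_ptr, 1);
        rcu_assign_pointer(demo_global_ptr, new_state);
        synchronize_rcu();      /* Wait out all readers of the old version. */
        kfree(old);
}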
+ */ +void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + rgosp->rgos_norm = RCU_GET_STATE_COMPLETED; + rgosp->rgos_exp = RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full); /** * get_state_synchronize_rcu - Snapshot current RCU state * * Returns a cookie that is used by a later call to cond_synchronize_rcu() - * to determine whether or not a full grace period has elapsed in the - * meantime. + * or poll_state_synchronize_rcu() to determine whether or not a full + * grace period has elapsed in the meantime. */ unsigned long get_state_synchronize_rcu(void) { @@ -2975,33 +3402,252 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. */ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state.gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq_polled); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); /** - * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited + * @rgosp: location to place combined normal/expedited grace-period state * - * @oldstate: return value from earlier call to get_state_synchronize_rcu() + * Places the normal and expedited grace-period states in @rgosp. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * The rcu_gp_oldstate structure takes up twice the memory of an unsigned + * long, but is guaranteed to see all grace periods. In contrast, the + * combined state occupies less memory, but can sometimes fail to take + * grace periods into account. + * + * This does not guarantee that the needed grace period will actually + * start. + */ +void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + /* + * Any prior manipulation of RCU-protected data must happen + * before the loads from ->gp_seq and ->expedited_sequence. + */ + smp_mb(); /* ^^^ */ + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); + rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); + +/* + * Helper function for start_poll_synchronize_rcu() and + * start_poll_synchronize_rcu_full(). + */ +static void start_poll_synchronize_rcu_common(void) +{ + unsigned long flags; + bool needwake; + struct rcu_data *rdp; + struct rcu_node *rnp; + + local_irq_save(flags); + rdp = this_cpu_ptr(&rcu_data); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); // irqs already disabled. + // Note it is possible for a grace period to have elapsed between + // the above call to get_state_synchronize_rcu() and the below call + // to rcu_seq_snap. This is OK, the worst that happens is that we + // get a grace period that no one needed. These accesses are ordered + // by smp_mb(), and we are accessing them in the opposite order + // from which they are updated at grace-period start, as required. 
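One plausible idiom for the pre-completed cookie just defined, sketched with a hypothetical demo_obj wrapper: seed the cookie so that the first poll reports completion, and only replace it with a real snapshot once there is something to wait for.

struct demo_obj {
        struct rcu_gp_oldstate rgos;
};

static void demo_obj_init(struct demo_obj *obj)
{
        /* First poll reports completion: nothing to wait for yet. */
        get_completed_synchronize_rcu_full(&obj->rgos);
}

static void demo_obj_mark_dirty(struct demo_obj *obj)
{
        get_state_synchronize_rcu_full(&obj->rgos); /* real snapshot */
}

static bool demo_obj_quiesced(struct demo_obj *obj)
{
        return poll_state_synchronize_rcu_full(&obj->rgos);
}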
+ needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq)); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(); +} + +/** + * start_poll_synchronize_rcu - Snapshot and start RCU grace period + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * or poll_state_synchronize_rcu() to determine whether or not a full + * grace period has elapsed in the meantime. If the needed grace period + * is not already slated to start, notifies RCU core of the need for that + * grace period. + */ +unsigned long start_poll_synchronize_rcu(void) +{ + unsigned long gp_seq = get_state_synchronize_rcu(); + + start_poll_synchronize_rcu_common(); + return gp_seq; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); + +/** + * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() + * + * Places the normal and expedited grace-period states in *@rgosp. This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * If the needed grace period is not already slated to start, notifies + * RCU core of the need for that grace period. + */ +void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + get_state_synchronize_rcu_full(rgosp); + + start_poll_synchronize_rcu_common(); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full); + +/** + * poll_state_synchronize_rcu - Has the specified RCU grace period completed? + * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * + * If a full RCU grace period has elapsed since the earlier call from + * which @oldstate was obtained, return @true, otherwise return @false. + * If @false is returned, it is the caller's responsibility to invoke this + * function later on until it does return @true. Alternatively, the caller + * can explicitly wait for a grace period, for example, by passing @oldstate + * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited() + * on the one hand or by directly invoking either synchronize_rcu() or + * synchronize_rcu_expedited() on the other. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than a billion grace periods (and way more on a 64-bit system!). + * Those needing to keep old state values for very long time periods + * (many hours even on 32-bit systems) should check them occasionally and + * either refresh them or set a flag indicating that the grace period has + * completed. Alternatively, they can use get_completed_synchronize_rcu() + * to get a guaranteed-completed grace-period state. + * + * In addition, because oldstate compresses the grace-period state for + * both normal and expedited grace periods into a single unsigned long, + * it can miss a grace period when synchronize_rcu() runs concurrently + * with synchronize_rcu_expedited(). If this is unacceptable, please + * instead use the _full() variant of these polling APIs. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate, and that returned at the end + * of this function.
+ */ +bool poll_state_synchronize_rcu(unsigned long oldstate) +{ + if (oldstate == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) { + smp_mb(); /* Ensure GP ends before subsequent accesses. */ + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); + +/** + * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed? + * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() + * + * If a full RCU grace period has elapsed since the earlier call from + * which *rgosp was obtained, return @true, otherwise return @false. + * If @false is returned, it is the caller's responsibility to invoke this + * function later on until it does return @true. Alternatively, the caller + * can explicitly wait for a grace period, for example, by passing @rgosp + * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited + * for more than a billion grace periods (and way more on a 64-bit + * system!). Those needing to keep rcu_gp_oldstate values for very + * long time periods (many hours even on 32-bit systems) should check + * them occasionally and either refresh them or set a flag indicating + * that the grace period has completed. Alternatively, they can use + * get_completed_synchronize_rcu_full() to get a guaranteed-completed + * grace-period state. + * + * This function provides the same memory-ordering guarantees that would + * be provided by a synchronize_rcu() that was invoked at the call to + * the function that provided @rgosp, and that returned at the end of this + * function. And this guarantee requires that the root rcu_node structure's + * ->gp_seq field be checked instead of that of the rcu_state structure. + * The problem is that the just-ending grace-period's callbacks can be + * invoked between the time that the root rcu_node structure's ->gp_seq + * field is updated and the time that the rcu_state structure's ->gp_seq + * field is updated. Therefore, if a single synchronize_rcu() is to + * cause a subsequent poll_state_synchronize_rcu_full() to return @true, + * then the root rcu_node structure is the one that needs to be polled. + */ +bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + struct rcu_node *rnp = rcu_get_root(); + + smp_mb(); // Order against root rcu_node structure grace-period cleanup. + if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) || + rgosp->rgos_exp == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) { + smp_mb(); /* Ensure GP ends before subsequent accesses. */ + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full); + +/** + * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() * * If a full RCU grace period has elapsed since the earlier call to - * get_state_synchronize_rcu(), just return. Otherwise, invoke - * synchronize_rcu() to wait for a full grace period. + * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. + * Otherwise, invoke synchronize_rcu() to wait for a full grace period. * - * Yes, this function does not take counter wrap into account. 
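Taken together with start_poll_synchronize_rcu(), this polling API supports deferred reclaim without ever blocking. A minimal sketch, with hypothetical names (struct lazy_free and its helpers):

struct lazy_free {
        void *ptr;
        unsigned long gp_cookie;
};

static void lazy_free_start(struct lazy_free *lf, void *ptr)
{
        lf->ptr = ptr;
        lf->gp_cookie = start_poll_synchronize_rcu(); /* also kicks a GP */
}

static bool lazy_free_try(struct lazy_free *lf)
{
        if (!poll_state_synchronize_rcu(lf->gp_cookie))
                return false;   /* Grace period not over; poll again later. */
        kfree(lf->ptr);
        lf->ptr = NULL;
        return true;
}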
But - * counter wrap is harmless. If the counter wraps, we have waited for + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), - * so waiting for one additional grace period should be just fine. + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate and that returned at the end + * of this function. */ void cond_synchronize_rcu(unsigned long oldstate) { - if (!rcu_seq_done(&rcu_state.gp_seq, oldstate)) + if (!poll_state_synchronize_rcu(oldstate)) synchronize_rcu(); - else - smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); +/** + * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() + * + * If a full RCU grace period has elapsed since the call to + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was + * obtained, just return. Otherwise, invoke synchronize_rcu() to wait + * for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @rgosp and that returned at the end of + * this function. + */ +void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) +{ + if (!poll_state_synchronize_rcu_full(rgosp)) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full); + /* * Check to see if there is any immediate RCU-related work to be done by * the current CPU, returning 1 if so and zero otherwise. The checks are @@ -3009,29 +3655,42 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu); * CPU-local state are performed first. However, we must check for CPU * stalls first, else we might not get a chance. */ -static int rcu_pending(void) +static int rcu_pending(int user) { + bool gp_in_progress; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + lockdep_assert_irqs_disabled(); + /* Check for CPU stalls, if enabled. */ check_cpu_stall(rdp); - /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ - if (rcu_nohz_full_cpu()) + /* Does this CPU need a deferred NOCB wakeup? */ + if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE)) + return 1; + + /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ + gp_in_progress = rcu_gp_in_progress(); + if ((user || rcu_is_cpu_rrupt_from_idle() || + (gp_in_progress && + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + + nohz_full_patience_delay_jiffies))) && + rcu_nohz_full_cpu()) return 0; /* Is the RCU core waiting for a quiescent state from this CPU? 
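A common application of cond_synchronize_rcu() is to take the snapshot at setup time and pay for a grace-period wait at teardown only when one has not already elapsed. A hedged sketch, with hypothetical names (struct demo_cache and its helpers):

struct demo_cache {
        unsigned long gp_snap;
        void *data;
};

static void demo_cache_init(struct demo_cache *c, void *data)
{
        c->data = data;
        c->gp_snap = get_state_synchronize_rcu();
}

static void demo_cache_teardown(struct demo_cache *c)
{
        /* Blocks only if no full grace period has elapsed since init. */
        cond_synchronize_rcu(c->gp_snap);
        kfree(c->data);
}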
*/ - if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) + if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress) return 1; /* Does this CPU have callbacks ready to invoke? */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) + if (!rcu_rdp_is_offloaded(rdp) && + rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && + if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; @@ -3040,37 +3699,11 @@ static int rcu_pending(void) unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ return 1; - /* Does this CPU need a deferred NOCB wakeup? */ - if (rcu_nocb_need_deferred_wakeup(rdp)) - return 1; - /* nothing to do */ return 0; } /* - * Return true if the specified CPU has any callback. If all_lazy is - * non-NULL, store an indication of whether all callbacks are lazy. - * (If there are no callbacks, all of them are deemed to be lazy.) - */ -static bool rcu_cpu_has_callbacks(bool *all_lazy) -{ - bool al = true; - bool hc = false; - struct rcu_data *rdp; - - rdp = this_cpu_ptr(&rcu_data); - if (!rcu_segcblist_empty(&rdp->cblist)) { - hc = true; - if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) - al = false; - } - if (all_lazy) - *all_lazy = al; - return hc; -} - -/* * Helper function for rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. */ @@ -3083,35 +3716,77 @@ static void rcu_barrier_trace(const char *s, int cpu, unsigned long done) /* * RCU callback function for rcu_barrier(). If we are last, wake * up the task executing rcu_barrier(). + * + * Note that the value of rcu_state.barrier_sequence must be captured + * before the atomic_dec_and_test(). Otherwise, if this CPU is not last, + * other CPUs might count the value down to zero before this CPU gets + * around to invoking rcu_barrier_trace(), which might result in bogus + * data from the next instance of rcu_barrier(). */ static void rcu_barrier_callback(struct rcu_head *rhp) { + unsigned long __maybe_unused s = rcu_state.barrier_sequence; + + rhp->next = rhp; // Mark the callback as having been invoked. if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { - rcu_barrier_trace(TPS("LastCB"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("LastCB"), -1, s); complete(&rcu_state.barrier_completion); } else { - rcu_barrier_trace(TPS("CB"), -1, rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("CB"), -1, s); } } /* - * Called with preemption disabled, and from cross-cpu IRQ context. + * If needed, entrain an rcu_barrier() callback on rdp->cblist. 
*/ -static void rcu_barrier_func(void *unused) +static void rcu_barrier_entrain(struct rcu_data *rdp) { - struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); + unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence); + unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap); + bool wake_nocb = false; + bool was_alldone = false; + lockdep_assert_held(&rcu_state.barrier_lock); + if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq)) + return; rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); - if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + rcu_nocb_lock(rdp); + /* + * Flush bypass and wakeup rcuog if we add callbacks to an empty regular + * queue. This way we don't wait for bypass timer that can reach seconds + * if it's fully lazy. + */ + was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist); + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); + wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist); + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - rcu_barrier_trace(TPS("IRQNQ"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence); } + rcu_nocb_unlock(rdp); + if (wake_nocb) + wake_nocb_gp(rdp, false); + smp_store_release(&rdp->barrier_seq_snap, gseq); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_handler(void *cpu_in) +{ + uintptr_t cpu = (uintptr_t)cpu_in; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + lockdep_assert_irqs_disabled(); + WARN_ON_ONCE(cpu != rdp->cpu); + WARN_ON_ONCE(cpu != smp_processor_id()); + raw_spin_lock(&rcu_state.barrier_lock); + rcu_barrier_entrain(rdp); + raw_spin_unlock(&rcu_state.barrier_lock); } /** @@ -3121,10 +3796,17 @@ static void rcu_barrier_func(void *unused) * to complete. For example, if there are no RCU callbacks queued anywhere * in the system, then rcu_barrier() is within its rights to return * immediately, without waiting for anything, much less an RCU grace period. + * In fact, rcu_barrier() will normally not result in any RCU grace periods + * beyond those that were already destined to be executed. + * + * In kernels built with CONFIG_RCU_LAZY=y, this function also hurries all + * pending lazy RCU callbacks. */ void rcu_barrier(void) { - int cpu; + uintptr_t cpu; + unsigned long flags; + unsigned long gseq; struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); @@ -3135,26 +3817,28 @@ void rcu_barrier(void) /* Did someone else do our work for us? */ if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { - rcu_barrier_trace(TPS("EarlyExit"), -1, - rcu_state.barrier_sequence); + rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rcu_state.barrier_mutex); return; } /* Mark the start of the barrier operation. */ + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); rcu_seq_start(&rcu_state.barrier_sequence); + gseq = rcu_state.barrier_sequence; rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); /* - * Initialize the count to one rather than to zero in order to - * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). 
Exclude CPU-hotplug operations - * to ensure that no offline CPU has callbacks queued. + * Initialize the count to two rather than to zero in order + * to avoid a too-soon return to zero in case of an immediate + * invocation of the just-enqueued callback (or preemption of + * this task). Exclude CPU-hotplug operations to ensure that no + * offline non-offloaded CPU has callbacks queued. */ init_completion(&rcu_state.barrier_completion); - atomic_set(&rcu_state.barrier_cpu_count, 1); - get_online_cpus(); + atomic_set(&rcu_state.barrier_cpu_count, 2); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); /* * Force each CPU with callbacks to register a new callback. @@ -3162,37 +3846,38 @@ void rcu_barrier(void) * corresponding CPU's preceding callbacks have been invoked. */ for_each_possible_cpu(cpu) { - if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) - continue; rdp = per_cpu_ptr(&rcu_data, cpu); - if (rcu_is_nocb_cpu(cpu)) { - if (!rcu_nocb_cpu_needs_barrier(cpu)) { - rcu_barrier_trace(TPS("OfflineNoCB"), cpu, - rcu_state.barrier_sequence); - } else { - rcu_barrier_trace(TPS("OnlineNoCB"), cpu, - rcu_state.barrier_sequence); - smp_mb__before_atomic(); - atomic_inc(&rcu_state.barrier_cpu_count); - __call_rcu(&rdp->barrier_head, - rcu_barrier_callback, cpu, 0); - } - } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { - rcu_barrier_trace(TPS("OnlineQ"), cpu, - rcu_state.barrier_sequence); - smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); - } else { - rcu_barrier_trace(TPS("OnlineNQ"), cpu, - rcu_state.barrier_sequence); +retry: + if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq) + continue; + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + if (!rcu_segcblist_n_cbs(&rdp->cblist)) { + WRITE_ONCE(rdp->barrier_seq_snap, gseq); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence); + continue; + } + if (!rcu_rdp_cpu_online(rdp)) { + rcu_barrier_entrain(rdp); + WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence); + continue; } + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); + if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) { + schedule_timeout_uninterruptible(1); + goto retry; + } + WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq); + rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence); } - put_online_cpus(); /* * Now that we have an rcu_barrier_callback() callback on each * CPU, and thus each counted, remove the initial count. */ - if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) + if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count)) complete(&rcu_state.barrier_completion); /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ @@ -3201,16 +3886,218 @@ void rcu_barrier(void) /* Mark the end of the barrier operation. */ rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence); rcu_seq_end(&rcu_state.barrier_sequence); + gseq = rcu_state.barrier_sequence; + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + + WRITE_ONCE(rdp->barrier_seq_snap, gseq); + } /* Other rcu_barrier() invocations can now safely proceed. 
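The canonical caller of rcu_barrier() is module-unload code, which must ensure that no callback can run after the module text and data are freed. A hedged sketch in which demo_unregister() and demo_cachep are hypothetical:

static struct kmem_cache *demo_cachep;

static void __exit demo_exit(void)
{
        demo_unregister();      /* Hypothetical: stop posting new callbacks. */
        rcu_barrier();          /* Wait for already-posted callbacks to run. */
        kmem_cache_destroy(demo_cachep);
}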
*/ mutex_unlock(&rcu_state.barrier_mutex); } EXPORT_SYMBOL_GPL(rcu_barrier); +static unsigned long rcu_barrier_last_throttle; + +/** + * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second + * + * This can be thought of as guard rails around rcu_barrier() that + * permits unrestricted userspace use, at least assuming the hardware's + * try_cmpxchg() is robust. There will be at most one call per second to + * rcu_barrier() system-wide from use of this function, which means that + * callers might needlessly wait a second or three. + * + * This is intended for use by test suites to avoid OOM by flushing RCU + * callbacks from the previous test before starting the next. See the + * rcutree.do_rcu_barrier module parameter for more information. + * + * Why not simply make rcu_barrier() more scalable? That might be + * the eventual endpoint, but let's keep it simple for the time being. + * Note that the module parameter infrastructure serializes calls to a + * given .set() function, but should concurrent .set() invocation ever be + * possible, we are ready! + */ +static void rcu_barrier_throttled(void) +{ + unsigned long j = jiffies; + unsigned long old = READ_ONCE(rcu_barrier_last_throttle); + unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); + + while (time_in_range(j, old, old + HZ / 16) || + !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) { + schedule_timeout_idle(HZ / 16); + if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { + smp_mb(); /* caller's subsequent code after above check. */ + return; + } + j = jiffies; + old = READ_ONCE(rcu_barrier_last_throttle); + } + rcu_barrier(); +} + +/* + * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier + * request arrives. We insist on a true value to allow for possible + * future expansion. + */ +static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp) +{ + bool b; + int ret; + + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) + return -EAGAIN; + ret = kstrtobool(val, &b); + if (!ret && b) { + atomic_inc((atomic_t *)kp->arg); + rcu_barrier_throttled(); + atomic_dec((atomic_t *)kp->arg); + } + return ret; +} + +/* + * Output the number of outstanding rcutree.do_rcu_barrier requests. + */ +static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg)); +} + +static const struct kernel_param_ops do_rcu_barrier_ops = { + .set = param_set_do_rcu_barrier, + .get = param_get_do_rcu_barrier, +}; +static atomic_t do_rcu_barrier; +module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644); + +/* + * Compute the mask of online CPUs for the specified rcu_node structure. + * This will not be stable unless the rcu_node structure's ->lock is + * held, but the bit corresponding to the current CPU will be stable + * in most contexts. + */ +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +{ + return READ_ONCE(rnp->qsmaskinitnext); +} + +/* + * Is the CPU corresponding to the specified rcu_data structure online + * from RCU's perspective? This perspective is given by that structure's + * ->qsmaskinitnext field rather than by the global cpu_online_mask. 
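The throttle above combines time_in_range() with try_cmpxchg() to admit one caller per interval without taking a lock. The same idiom, reduced to a generic sketch with hypothetical names (demo_last_throttle, demo_throttled()):

static unsigned long demo_last_throttle;       /* jiffies of last admission */

static void demo_throttled(void)
{
        unsigned long j = jiffies;
        unsigned long old = READ_ONCE(demo_last_throttle);

        /* Admit one caller per HZ/16 interval; losers sleep and retry. */
        while (time_in_range(j, old, old + HZ / 16) ||
               !try_cmpxchg(&demo_last_throttle, &old, j)) {
                schedule_timeout_idle(HZ / 16);
                j = jiffies;
                old = READ_ONCE(demo_last_throttle);
        }
        pr_info("admitted at %lu\n", j);        /* stand-in for the real work */
}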
+ */ +static bool rcu_rdp_cpu_online(struct rcu_data *rdp) +{ + return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); +} + +bool rcu_cpu_online(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return rcu_rdp_cpu_online(rdp); +} + +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) + +/* + * Is the current CPU online as far as RCU is concerned? + * + * Disable preemption to avoid false positives that could otherwise + * happen due to the current CPU number being sampled, this task being + * preempted, its old CPU being taken offline, resuming on some other CPU, + * then determining that its old CPU is now offline. + * + * Disable checking if in an NMI handler because we cannot safely + * report errors from NMI handlers anyway. In addition, it is OK to use + * RCU on an offline processor during initial boot, hence the check for + * rcu_scheduler_fully_active. + */ +bool notrace rcu_lockdep_current_cpu_online(void) +{ + struct rcu_data *rdp; + bool ret = false; + + if (in_nmi() || !rcu_scheduler_fully_active) + return true; + preempt_disable_notrace(); + rdp = this_cpu_ptr(&rcu_data); + /* + * Strictly, we care here about the case where the current CPU is + * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask + * not being up to date. So arch_spin_is_locked() might have a + * false positive if it's held by some *other* CPU, but that's + * OK because that just means a false *negative* on the warning. + */ + if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) + ret = true; + preempt_enable_notrace(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ + +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!READ_ONCE(rcu_state.n_online_cpus); +} + +/* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section. Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields. Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated. + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely. That said, invoking it after the fact will cost you + * a needless lock acquisition. So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + raw_lockdep_assert_held_rcu_node(rnp_leaf); + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || + WARN_ON_ONCE(rnp_leaf->qsmaskinit) || + WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) + return; + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (!rnp) + break; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + /* Between grace periods, so better already be zero! */ + WARN_ON_ONCE(rnp->qsmask); + if (rnp->qsmaskinit) { + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. 
*/ + return; + } + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + } +} + /* * Propagate ->qsinitmask bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller - * must hold the corresponding leaf rcu_node ->lock with interrrupts + * must hold the corresponding leaf rcu_node ->lock with interrupts * disabled. */ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) @@ -3241,20 +4128,93 @@ static void __init rcu_boot_init_percpu_data(int cpu) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); - WARN_ON_ONCE(rdp->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); + INIT_WORK(&rdp->strict_work, strict_work_handler); + WARN_ON_ONCE(ct->nesting != 1); + WARN_ON_ONCE(rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu))); + rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; - rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; rdp->rcu_onl_gp_seq = rcu_state.gp_seq; - rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; + rdp->rcu_onl_gp_state = RCU_GP_CLEANED; + rdp->last_sched_clock = jiffies; rdp->cpu = cpu; rcu_boot_init_nocb_percpu_data(rdp); } +static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) +{ + cpumask_var_t affinity; + int cpu; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return; + + for_each_leaf_node_possible_cpu(rnp, cpu) + cpumask_set_cpu(cpu, affinity); + + kthread_affine_preferred(t, affinity); + + free_cpumask_var(affinity); +} + +struct kthread_worker *rcu_exp_gp_kworker; + +static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) +{ + struct kthread_worker *kworker; + const char *name = "rcu_exp_par_gp_kthread_worker/%d"; + struct sched_param param = { .sched_priority = kthread_prio }; + int rnp_index = rnp - rcu_get_root(); + + if (rnp->exp_kworker) + return; + + kworker = kthread_create_worker(0, name, rnp_index); + if (IS_ERR_OR_NULL(kworker)) { + pr_err("Failed to create par gp kworker on %d/%d\n", + rnp->grplo, rnp->grphi); + return; + } + WRITE_ONCE(rnp->exp_kworker, kworker); + + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param); + + rcu_thread_affine_rnp(kworker->task, rnp); + wake_up_process(kworker->task); +} + +static void __init rcu_start_exp_gp_kworker(void) +{ + const char *name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_run_worker(0, name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", name); + rcu_exp_gp_kworker = NULL; + return; + } + + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param); +} + +static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp) +{ + if (rcu_scheduler_fully_active) { + mutex_lock(&rnp->kthread_mutex); + rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_exp_par_gp_kworker(rnp); + mutex_unlock(&rnp->kthread_mutex); + } +} + /* * Invoked early in the CPU-online process, when pretty much all services * are available. The incoming CPU is not present. @@ -3262,56 +4222,65 @@ rcu_boot_init_percpu_data(int cpu)
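The expedited-GP kworkers above are built on the generic kthread_worker facility. A minimal, self-contained sketch of that facility under assumed names (demo_worker, demo_fn); note that the spawn helpers have been renamed across kernel versions (kthread_run_worker() above), so the exact creation call used here is an assumption:

static struct kthread_worker *demo_worker;
static struct kthread_work demo_work;

static void demo_fn(struct kthread_work *work)
{
        pr_info("running in worker context\n");
}

static int __init demo_worker_init(void)
{
        demo_worker = kthread_create_worker(0, "demo_worker");
        if (IS_ERR(demo_worker))
                return PTR_ERR(demo_worker);
        kthread_init_work(&demo_work, demo_fn);
        kthread_queue_work(demo_worker, &demo_work);
        return 0;
}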
Note that only one online or * offline event can be happening at a given time. Note also that we can * accept some slop in the rsp->gp_seq access due to the fact that this - * CPU cannot possibly have any RCU callbacks in flight yet. + * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet. + * And any offloaded callbacks are being numbered elsewhere. */ int rcutree_prepare_cpu(unsigned int cpu) { unsigned long flags; + struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rcu_get_root(); /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rcu_state.n_force_qs; + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->blimit = blimit; - if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ - !init_nocb_callback_list(rdp)) - rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ - rcu_dynticks_eqs_online(); + ct->nesting = 1; /* CPU not up, no tearing. */ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* + * Only non-NOCB CPUs that didn't have early-boot callbacks need to be + * (re-)initialized. + */ + if (!rcu_segcblist_is_enabled(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ + + /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed * propagation up the rcu_node tree will happen at the beginning * of the next grace period. */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - rdp->beenonline = true; /* We have now been online. */ - rdp->gp_seq = rnp->gp_seq; - rdp->gp_seq_needed = rnp->gp_seq; + rdp->gp_seq = READ_ONCE(rnp->gp_seq); + rdp->gp_seq_needed = rdp->gp_seq; rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; - rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; + rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler); + rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcu_prepare_kthreads(cpu); - rcu_spawn_all_nocb_kthreads(cpu); + + rcu_preempt_deferred_qs_init(rdp); + rcu_spawn_rnp_kthreads(rnp); + rcu_spawn_cpu_nocb_kthread(cpu); + ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); return 0; } /* - * Update RCU priority boot kthread affinity for CPU-hotplug changes. + * Has the specified (known valid) CPU ever been fully online? */ -static void rcutree_affinity_setting(unsigned int cpu, int outgoing) +bool rcu_cpu_beenfullyonline(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); + return smp_load_acquire(&rdp->beenonline); } /* @@ -3329,39 +4298,14 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->ffmask |= rdp->grpmask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_online_cpu(cpu); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ - sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); - return 0; -} - -/* - * Near the beginning of the process. The CPU is still very much alive - * with pretty much all services enabled. 
- */ -int rcutree_offline_cpu(unsigned int cpu) -{ - unsigned long flags; - struct rcu_data *rdp; - struct rcu_node *rnp; - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask &= ~rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - rcutree_affinity_setting(cpu, cpu); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_offline_cpu(cpu); + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); return 0; } -static DEFINE_PER_CPU(int, rcu_cpu_started); - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3372,45 +4316,57 @@ static DEFINE_PER_CPU(int, rcu_cpu_started); * Note that this function is special in that it is invoked directly * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * This incoming CPU must not have enabled interrupts yet. + * + * This mirrors the effects of rcutree_report_cpu_dead(). */ -void rcu_cpu_starting(unsigned int cpu) +void rcutree_report_cpu_starting(unsigned int cpu) { - unsigned long flags; unsigned long mask; - int nbits; - unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; + bool newcpu; - if (per_cpu(rcu_cpu_started, cpu)) + lockdep_assert_irqs_disabled(); + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu_started) return; + rdp->cpu_started = true; - per_cpu(rcu_cpu_started, cpu) = 1; - - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; mask = rdp->grpmask; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->qsmaskinitnext |= mask; - oldmask = rnp->expmaskinitnext; + arch_spin_lock(&rcu_state.ofl_lock); + rcu_watching_online(); + raw_spin_lock(&rcu_state.barrier_lock); + raw_spin_lock_rcu_node(rnp); + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); + raw_spin_unlock(&rcu_state.barrier_lock); + newcpu = !(rnp->expmaskinitnext & mask); rnp->expmaskinitnext |= mask; - oldmask ^= rnp->expmaskinitnext; - nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ - smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */ + ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); - rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); - if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + rdp->rcu_onl_gp_state = READ_ONCE(rcu_state.gp_state); + + /* An incoming CPU should never be blocking a grace period. */ + if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ + /* rcu_report_qs_rnp() *really* wants some flags to restore */ + unsigned long flags; + + local_irq_save(flags); + rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); } else { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + raw_spin_unlock_rcu_node(rnp); } + arch_spin_unlock(&rcu_state.ofl_lock); + smp_store_release(&rdp->beenonline, true); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing function has no further need of RCU, so remove it from * the rcu_node tree's ->qsmaskinitnext bit masks. 
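For contrast with these precisely-placed calls, an ordinary subsystem would use the cpuhp_step mechanism mentioned above and let the hotplug core choose the invocation point. A hedged sketch with hypothetical callbacks (demo_online(), demo_offline()):

static int demo_online(unsigned int cpu)
{
        pr_info("cpu %u coming up\n", cpu);
        return 0;
}

static int demo_offline(unsigned int cpu)
{
        pr_info("cpu %u going down\n", cpu);
        return 0;
}

static int __init demo_hp_init(void)
{
        int ret;

        /* Dynamic state: callbacks run on each online/offline transition. */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
                                demo_online, demo_offline);
        return ret < 0 ? ret : 0;
}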
@@ -3418,38 +4374,59 @@ void rcu_cpu_starting(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the outgoing CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * + * This mirrors the effect of rcutree_report_cpu_starting(). */ -void rcu_report_dead(unsigned int cpu) +void rcutree_report_cpu_dead(void) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - /* QS for any half-done expedited grace period. */ - preempt_disable(); - rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); - preempt_enable(); + /* + * IRQS must be disabled from now on and until the CPU dies, or an interrupt + * may introduce a new READ-side while it is actually off the QS masks. + */ + lockdep_assert_irqs_disabled(); + /* + * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution. + * The requested QS must have been reported on the last context switch + * from stop machine to idle. + */ + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); + // Do any dangling deferred wakeups. + do_nocb_deferred_wakeup(rdp); + rcu_preempt_deferred_qs(current); /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - raw_spin_lock(&rcu_state.ofl_lock); + + /* + * Hold the ofl_lock and rnp lock to avoid races between CPU going + * offline and doing a QS report (as below), versus rcu_gp_init(). + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section + * for more details. + */ + arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); - rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); + rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ + rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } - rnp->qsmaskinitnext &= ~mask; + /* Clear from ->qsmaskinitnext to mark offline. */ + WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - raw_spin_unlock(&rcu_state.ofl_lock); - - per_cpu(rcu_cpu_started, cpu) = 0; + arch_spin_unlock(&rcu_state.ofl_lock); + rdp->cpu_started = false; } +#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing CPU has just passed through the dying-idle state, and we * are being invoked from the CPU that was IPIed to continue the offline @@ -3459,36 +4436,105 @@ void rcutree_migrate_callbacks(int cpu) { unsigned long flags; struct rcu_data *my_rdp; + struct rcu_node *my_rnp; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp_root = rcu_get_root(); bool needwake; - if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) + return; + + raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags); + if (rcu_segcblist_empty(&rdp->cblist)) { + raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags); return; /* No callbacks to migrate. 
*/ + } - local_irq_save(flags); + WARN_ON_ONCE(rcu_rdp_cpu_online(rdp)); + rcu_barrier_entrain(rdp); my_rdp = this_cpu_ptr(&rcu_data); - if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { - local_irq_restore(flags); - return; - } - raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + my_rnp = my_rdp->mynode; + rcu_nocb_lock(my_rdp); /* irqs already disabled. */ + WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false)); + raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */ /* Leverage recent GPs and set GP for new callbacks. */ - needwake = rcu_advance_cbs(rnp_root, rdp) || - rcu_advance_cbs(rnp_root, my_rdp); + needwake = rcu_advance_cbs(my_rnp, rdp) || + rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); - WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != - !rcu_segcblist_n_cbs(&my_rdp->cblist)); - raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */ + needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); + rcu_segcblist_disable(&rdp->cblist); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); + check_cb_ovld_locked(my_rdp, my_rnp); + if (rcu_rdp_is_offloaded(my_rdp)) { + raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ + __call_rcu_nocb_wake(my_rdp, true, flags); + } else { + rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ + } + local_irq_restore(flags); if (needwake) rcu_gp_kthread_wake(); + lockdep_assert_irqs_enabled(); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); } -#endif + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. + */ +int rcutree_dead_cpu(unsigned int cpu) +{ + ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); + return 0; +} + +/* + * Near the end of the offline process. Trace the fact that this CPU + * is going offline. + */ +int rcutree_dying_cpu(unsigned int cpu) +{ + bool blkd; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp = rdp->mynode; + + blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); + return 0; +} + +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. 
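 *
 * For orientation, the offline-side hooks in this file run in roughly
 * this order (a sketch inferred from the comments above, not new code):
 *
 *	rcutree_offline_cpu(cpu)	// early; sets TICK_DEP_BIT_RCU
 *	rcutree_dying_cpu(cpu)		// near the end; tracing only
 *	rcutree_report_cpu_dead()	// on the dying CPU, irqs disabled
 *	rcutree_migrate_callbacks(cpu)	// from the CPU finishing the offline
 *	rcutree_dead_cpu(cpu)		// stop-machine done; clear tick dep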
+ */ +int rcutree_offline_cpu(unsigned int cpu) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + // nohz_full CPUs need the tick for stop-machine to work quickly + tick_dep_set(TICK_DEP_BIT_RCU); + return 0; +} +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* * On non-huge systems, use expedited RCU grace periods to make suspend @@ -3500,13 +4546,13 @@ static int rcu_pm_notify(struct notifier_block *self, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedite_gp(); + rcu_async_hurry(); + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); + rcu_async_relax(); break; default: break; @@ -3520,41 +4566,38 @@ static int rcu_pm_notify(struct notifier_block *self, static int __init rcu_spawn_gp_kthread(void) { unsigned long flags; - int kthread_prio_in = kthread_prio; struct rcu_node *rnp; struct sched_param sp; struct task_struct *t; - - /* Force priority into range. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 - && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) - kthread_prio = 2; - else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) - kthread_prio = 1; - else if (kthread_prio < 0) - kthread_prio = 0; - else if (kthread_prio > 99) - kthread_prio = 99; - - if (kthread_prio != kthread_prio_in) - pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", - kthread_prio, kthread_prio_in); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); rcu_scheduler_fully_active = 1; t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) return 0; - rnp = rcu_get_root(); - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rcu_state.gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } + rnp = rcu_get_root(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); + WRITE_ONCE(rcu_state.gp_activity, jiffies); + WRITE_ONCE(rcu_state.gp_req_activity, jiffies); + // Reset .gp_activity and .gp_req_activity before setting .gp_kthread. + smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); - rcu_spawn_nocb_kthreads(); - rcu_spawn_boost_kthreads(); + /* This is a pre-SMP initcall, we expect a single CPU */ + WARN_ON(num_online_cpus() > 1); + /* + * Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu() + * due to rcu_scheduler_fully_active. + */ + rcu_spawn_cpu_nocb_kthread(smp_processor_id()); + rcu_spawn_rnp_kthreads(rdp->mynode); + rcu_spawn_core_kthreads(); + /* Create kthread worker for expedited GPs */ + rcu_start_exp_gp_kworker(); return 0; } early_initcall(rcu_spawn_gp_kthread); @@ -3571,9 +4614,20 @@ early_initcall(rcu_spawn_gp_kthread); */ void rcu_scheduler_starting(void) { + unsigned long flags; + struct rcu_node *rnp; + WARN_ON(num_online_cpus() != 1); WARN_ON(nr_context_switches() > 0); rcu_test_sync_prims(); + + // Fix up the ->gp_seq counters. 
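	// Early boot can advance rcu_state.gp_seq before the rcu_node tree
	// is put to use, so propagate the root value to every node. For
	// reference, a polled-API caller that depends on these counters
	// looks like this (standard-API sketch, not part of this function):
	//
	//	unsigned long cookie = get_state_synchronize_rcu();
	//	...
	//	if (!poll_state_synchronize_rcu(cookie))
	//		synchronize_rcu();	// cookie's GP had not elapsed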
+ local_irq_save(flags); + rcu_for_each_node_breadth_first(rnp) + rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq; + local_irq_restore(flags); + + // Switch out of early boot mode. rcu_scheduler_active = RCU_SCHEDULER_INIT; rcu_test_sync_prims(); } @@ -3646,6 +4700,10 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[2]); init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); + mutex_init(&rnp->kthread_mutex); + raw_spin_lock_init(&rnp->exp_poll_lock); + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); } } @@ -3656,20 +4714,58 @@ static void __init rcu_init_one(void) while (i > rnp->grphi) rnp++; per_cpu_ptr(&rcu_data, i)->mynode = rnp; + per_cpu_ptr(&rcu_data, i)->barrier_head.next = + &per_cpu_ptr(&rcu_data, i)->barrier_head; rcu_boot_init_percpu_data(i); } } /* + * Force priority from the kernel command-line into range. + */ +static void __init sanitize_kthread_prio(void) +{ + int kthread_prio_in = kthread_prio; + + if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 + && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) + kthread_prio = 2; + else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) + kthread_prio = 1; + else if (kthread_prio < 0) + kthread_prio = 0; + else if (kthread_prio > 99) + kthread_prio = 99; + + if (kthread_prio != kthread_prio_in) + pr_alert("%s: Limited prio to %d from %d\n", + __func__, kthread_prio, kthread_prio_in); +} + +/* * Compute the rcu_node tree geometry from kernel parameters. This cannot * replace the definitions in tree.h because those are needed to size * the ->node array in the rcu_state structure. */ -static void __init rcu_init_geometry(void) +void rcu_init_geometry(void) { ulong d; int i; + static unsigned long old_nr_cpu_ids; int rcu_capacity[RCU_NUM_LVLS]; + static bool initialized; + + if (initialized) { + /* + * Warn if setup_nr_cpu_ids() had not yet been invoked, + * unless nr_cpus_ids == NR_CPUS, in which case who cares? + */ + WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids); + return; + } + + old_nr_cpu_ids = nr_cpu_ids; + initialized = true; /* * Initialize any unspecified boot parameters. @@ -3683,8 +4779,7 @@ static void __init rcu_init_geometry(void) jiffies_till_first_fqs = d; if (jiffies_till_next_fqs == ULONG_MAX) jiffies_till_next_fqs = d; - if (jiffies_till_sched_qs == ULONG_MAX) - adjust_jiffies_till_sched_qs(); + adjust_jiffies_till_sched_qs(); /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == RCU_FANOUT_LEAF && @@ -3699,8 +4794,7 @@ static void __init rcu_init_geometry(void) * Complain and fall back to the compile-time values if this * limit is exceeded. */ - if (rcu_fanout_leaf < 2 || - rcu_fanout_leaf > sizeof(unsigned long) * 8) { + if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > BITS_PER_LONG) { rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; @@ -3764,20 +4858,21 @@ static void __init rcu_dump_rcu_node_tree(void) } struct workqueue_struct *rcu_gp_wq; -struct workqueue_struct *rcu_par_gp_wq; void __init rcu_init(void) { - int cpu; + int cpu = smp_processor_id(); rcu_early_boot_tests(); rcu_bootup_announce(); + sanitize_kthread_prio(); rcu_init_geometry(); rcu_init_one(); if (dump_tree) rcu_dump_rcu_node_tree(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + if (use_softirq) + open_softirq(RCU_SOFTIRQ, rcu_core_si); /* * We don't need protection against CPU-hotplug here because @@ -3785,19 +4880,40 @@ void __init rcu_init(void) * or the scheduler are operational. 
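 *
 * For scale, the geometry computed above by rcu_init_geometry() with
 * the 64-bit defaults (RCU_FANOUT = 64, RCU_FANOUT_LEAF = 16) works
 * out as follows (worked example, assuming default Kconfig values):
 *	1 level:  16 CPUs
 *	2 levels: 16 * 64   = 1024 CPUs
 *	3 levels: 16 * 64^2 = 65536 CPUs
 * so the smallest depth whose capacity covers nr_cpu_ids is chosen.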
*/ pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) { - rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); - rcutree_online_cpu(cpu); - } + WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. + rcutree_prepare_cpu(cpu); + rcutree_report_cpu_starting(cpu); + rcutree_online_cpu(cpu); - /* Create workqueue for expedited GPs and for Tree SRCU. */ - rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); + /* Create workqueue for Tree SRCU and for expedited GPs. */ + rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM | WQ_PERCPU, 0); WARN_ON(!rcu_gp_wq); - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); - srcu_init(); + + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + WARN_ON(!sync_wq); + + /* Respect if explicitly disabled via a boot parameter. */ + if (rcu_normal_wake_from_gp < 0) { + if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD) + rcu_normal_wake_from_gp = 1; + } + + /* Fill in default value for rcutree.qovld boot parameter. */ + /* -After- the rcu_node ->lock fields are initialized! */ + if (qovld < 0) + qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; + else + qovld_calc = qovld; + + // Kick-start in case any polled grace periods started early. + (void)start_poll_synchronize_rcu_expedited(); + + rcu_test_sync_prims(); + + tasks_cblist_init_generic(); } +#include "tree_stall.h" #include "tree_exp.h" +#include "tree_nocb.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d90b02b53c0e..b8bbe7960cda 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -1,44 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/cache.h> +#include <linux/kthread.h> #include <linux/spinlock.h> #include <linux/rtmutex.h> #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> #include <linux/swait.h> -#include <linux/stop_machine.h> #include <linux/rcu_node_tree.h> #include "rcu_segcblist.h" -/* Communicate arguments to a workqueue handler. */ +/* Communicate arguments to a kthread worker handler. */ struct rcu_exp_work { - smp_call_func_t rew_func; unsigned long rew_s; - struct work_struct rew_work; + struct kthread_work rew_work; }; /* RCU's kthread states for tracing. */ @@ -56,7 +42,7 @@ struct rcu_node { raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ /* some rcu_state fields as well as */ /* following. */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq. */ + unsigned long gp_seq; /* Track rsp->gp_seq. 
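 * The low-order two bits encode grace-period state and the
 * remaining bits count grace periods, mirroring the rcu_seq
 * helpers in rcu.h (sketch, assuming the standard
 * RCU_SEQ_CTR_SHIFT == 2 encoding):
 *	ctr   = gp_seq >> RCU_SEQ_CTR_SHIFT;	// completed GPs
 *	state = gp_seq & RCU_SEQ_STATE_MASK;	// 0 idle, 1 in progress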
*/ unsigned long gp_seq_needed; /* Track furthest future GP request. */ unsigned long completedqs; /* All QSes done for this node. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ @@ -71,7 +57,6 @@ struct rcu_node { /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; - /* Online CPUs for next grace period. */ unsigned long expmask; /* CPUs or groups that need to check in */ /* to allow the current expedited GP */ /* to complete. */ @@ -83,12 +68,17 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + struct kthread_worker *exp_kworker; + /* Workers performing per node expedited GP */ + /* initialization. */ + unsigned long cbovldmask; + /* CPUs experiencing callback overload. */ unsigned long ffmask; /* Fully functional CPUs. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ - int grplo; /* lowest-numbered CPU or group here. */ - int grphi; /* highest-numbered CPU or group here. */ - u8 grpnum; /* CPU/group number for next level up. */ + int grplo; /* lowest-numbered CPU here. */ + int grphi; /* highest-numbered CPU here. */ + u8 grpnum; /* group number for next level up. */ u8 level; /* root is at level 0. */ bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ /* exit RCU read-side critical sections */ @@ -122,11 +112,15 @@ struct rcu_node { /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ + struct mutex kthread_mutex; + /* Exclusion for thread spawning and affinity */ + /* manipulation. */ struct task_struct *boost_kthread_task; /* kthread that takes care of priority */ /* boosting for this rcu_node structure. */ unsigned int boost_kthread_status; /* State of boost_kthread_task for tracing. */ + unsigned long n_boosts; /* Number of boosts for this rcu_node structure. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -138,6 +132,10 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; struct rcu_exp_work rew; bool exp_need_flush; /* Need to flush workitem? */ + raw_spinlock_t exp_poll_lock; + /* Lock and data for polled expedited grace periods. */ + unsigned long exp_seq_poll_rq; + struct work_struct exp_poll_wq; } ____cacheline_internodealigned_in_smp; /* @@ -159,22 +157,54 @@ union rcu_noqs { u16 s; /* Set of bits, aggregate OR here. */ }; +/* + * Record the snapshot of the core stats at half of the first RCU stall timeout. + * The member gp_seq is used to ensure that all members are updated only once + * during the sampling period. The snapshot is taken only if this gp_seq is not + * equal to rdp->gp_seq. + */ +struct rcu_snap_record { + unsigned long gp_seq; /* Track rdp->gp_seq counter */ + u64 cputime_irq; /* Accumulated cputime of hard irqs */ + u64 cputime_softirq;/* Accumulated cputime of soft irqs */ + u64 cputime_system; /* Accumulated cputime of kernel tasks */ + u64 nr_hardirqs; /* Accumulated number of hard irqs */ + unsigned int nr_softirqs; /* Accumulated number of soft irqs */ + unsigned long long nr_csw; /* Accumulated number of task switches */ + unsigned long jiffies; /* Track jiffies value */ +}; + +/* + * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention. + * to report quiescent states at the soonest possible time. 
+ * The request can be in one of the following states: + * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled. + * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it + * ran and we still haven't reported a quiescent state. + */ +#define DEFER_QS_IDLE 0 +#define DEFER_QS_PENDING 1 + /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - unsigned long gp_seq; /* Track rsp->rcu_gp_seq counter. */ + unsigned long gp_seq; /* Track rsp->gp_seq counter. */ unsigned long gp_seq_needed; /* Track furthest future GP request. */ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ - bool core_needs_qs; /* Core waits for quiesc state. */ + bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool deferred_qs; /* This CPU awaiting a deferred QS? */ + unsigned int gpwrap_count; /* Count of GP sequence wrap. */ + bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ + struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ + int defer_qs_iw_pending; /* Scheduler attention pending? */ + struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ @@ -182,77 +212,96 @@ struct rcu_data { /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* # callbacks invoked since boot. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ /* 3) dynticks interface. */ - int dynticks_snap; /* Per-GP tracking for dynticks. */ - long dynticks_nesting; /* Track process nesting level. */ - long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ - atomic_t dynticks; /* Even value for idle, else odd. */ + int watching_snap; /* Per-GP tracking for dynticks. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ -#ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* Are all CPU's CBs lazy? */ - unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */ - unsigned long nonlazy_posted_snap; - /* Nonlazy_posted snapshot. */ - unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ - unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ - int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + bool rcu_forced_tick; /* Forced tick to provide QS. */ + bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ /* 4) rcu_barrier(), OOM callbacks, and expediting. */ + unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */ struct rcu_head barrier_head; - int exp_dynticks_snap; /* Double-check need for IPI. */ + int exp_watching_snap; /* Double-check need for IPI. */ /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU - struct rcu_head *nocb_head; /* CBs waiting for kthread. 
*/ - struct rcu_head **nocb_tail; - atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ - atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ - struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ - struct rcu_head **nocb_follower_tail; - struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ - struct task_struct *nocb_kthread; + struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_state_wq; /* For offloading state changes */ + struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ struct timer_list nocb_timer; /* Enforce finite deferral. */ - - /* The following fields are used by the leader, hence own cacheline. */ - struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; - /* CBs waiting for GP. */ - struct rcu_head **nocb_gp_tail; - bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ - struct rcu_data *nocb_next_follower; - /* Next follower in wakeup chain. */ - - /* The following fields are used by the follower, hence new cachline. */ - struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; - /* Leader CPU takes GP-end wakeups. */ + unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ + struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */ + /* spawning */ + + /* The following fields are used by call_rcu, hence own cacheline. */ + raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp; + struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */ + unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */ + unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */ + int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */ + + /* The following fields are used by GP kthread, hence own cacheline. */ + raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp; + u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */ + u8 nocb_gp_bypass; /* Found a bypass on last scan? */ + u8 nocb_gp_gp; /* GP to wait for on last scan? */ + unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */ + unsigned long nocb_gp_loops; /* # passes through wait code. */ + struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ + bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ + struct task_struct *nocb_cb_kthread; + struct list_head nocb_head_rdp; /* + * Head of rcu_data list in wakeup chain, + * if rdp_gp. + */ + struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */ + struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */ + + /* The following fields are used by CB kthread, hence new cacheline. */ + struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; + /* GP rdp takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 6) Diagnostic data, including RCU CPU stall warnings. */ + /* 6) RCU priority boosting. */ + struct task_struct *rcu_cpu_kthread_task; + /* rcuc per-CPU kthread or NULL. */ + unsigned int rcu_cpu_kthread_status; + char rcu_cpu_has_work; + unsigned long rcuc_activity; + + /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ bool rcu_iw_pending; /* Is ->rcu_iw pending? 
*/ unsigned long rcu_iw_gp_seq; /* ->gp_seq associated with ->rcu_iw. */ unsigned long rcu_ofl_gp_seq; /* ->gp_seq at last offline. */ - short rcu_ofl_gp_flags; /* ->gp_flags at last offline. */ + short rcu_ofl_gp_state; /* ->gp_state at last offline. */ unsigned long rcu_onl_gp_seq; /* ->gp_seq at last online. */ - short rcu_onl_gp_flags; /* ->gp_flags at last online. */ + short rcu_onl_gp_state; /* ->gp_state at last online. */ unsigned long last_fqs_resched; /* Time of last rcu_resched(). */ + unsigned long last_sched_clock; /* Jiffies of last rcu_sched_clock_irq(). */ + struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */ + /* the first RCU stall timeout */ + long lazy_len; /* Length of buffered lazy callbacks. */ int cpu; }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ #define RCU_NOCB_WAKE_NOT 0 -#define RCU_NOCB_WAKE 1 -#define RCU_NOCB_WAKE_FORCE 2 +#define RCU_NOCB_WAKE_BYPASS 1 +#define RCU_NOCB_WAKE_LAZY 2 +#define RCU_NOCB_WAKE 3 +#define RCU_NOCB_WAKE_FORCE 4 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ @@ -278,6 +327,19 @@ do { \ } while (0) /* + * A max threshold for synchronize_rcu() users which are + * awaken directly by the rcu_gp_kthread(). Left part is + * deferred to the main worker. + */ +#define SR_MAX_USERS_WAKE_FROM_GP 5 +#define SR_NORMAL_GP_WAIT_HEAD_MAX 5 + +struct sr_wait_node { + atomic_t inuse; + struct llist_node node; +}; + +/* * RCU global state, including node hierarchy. This hierarchy is * represented in "heap" form in a dense array. The root (first level) * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second @@ -293,16 +355,23 @@ struct rcu_state { /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ int ncpus; /* # CPUs seen so far. */ + int n_online_cpus; /* # CPUs online for RCU. */ /* The following fields are guarded by the root rcu_node's lock. */ - u8 boost ____cacheline_internodealigned_in_smp; - /* Subject to priority boost. */ - unsigned long gp_seq; /* Grace-period sequence #. */ + unsigned long gp_seq ____cacheline_internodealigned_in_smp; + /* Grace-period sequence #. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ struct task_struct *gp_kthread; /* Task for grace periods. */ struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ + unsigned long gp_wake_time; /* Last GP kthread wake. */ + unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ + unsigned long gp_seq_polled; /* GP seq for polled API. */ + unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ + unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */ /* End of fields guarded by root rcu_node's lock. */ @@ -313,12 +382,16 @@ struct rcu_state { /* rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ + raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */ + struct mutex exp_mutex; /* Serialize expedited GP. */ struct mutex exp_wake_mutex; /* Serialize wakeup. */ unsigned long expedited_sequence; /* Take a ticket. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ + u8 cbovld; /* Callback overload now? */ + u8 cbovldnext; /* ^ ^ next time? */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). 
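 * The deadline derives from the rcutree.jiffies_till_first_fqs
 * and rcutree.jiffies_till_next_fqs module parameters; a hedged
 * command-line tuning example using those standard parameters:
 *	rcutree.jiffies_till_first_fqs=1 rcutree.jiffies_till_next_fqs=1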
*/ @@ -336,23 +409,39 @@ struct rcu_state { /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + int nr_fqs_jiffies_stall; /* Number of fqs loops after + * which read jiffies and set + * jiffies_stall. Stall + * warnings disabled if !0. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ /* GP start. */ - unsigned long gp_max; /* Maximum GP duration in */ - /* jiffies. */ const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ - raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; + arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ + + /* synchronize_rcu() part. */ + struct llist_head srs_next; /* request a GP users. */ + struct llist_node *srs_wait_tail; /* wait for GP users. */ + struct llist_node *srs_done_tail; /* ready for GP users. */ + struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; + struct work_struct srs_cleanup_work; + atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ + +#ifdef CONFIG_RCU_NOCB_CPU + struct mutex nocb_mutex; /* Guards (de-)offloading */ + int nocb_is_setup; /* nocb is setup from boot */ +#endif }; /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */ /* Values for rcu_state structure's gp_state field. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ @@ -365,18 +454,6 @@ struct rcu_state { #define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ -static const char * const gp_state_names[] = { - "RCU_GP_IDLE", - "RCU_GP_WAIT_GPS", - "RCU_GP_DONE_GPS", - "RCU_GP_ONOFF", - "RCU_GP_INIT", - "RCU_GP_WAIT_FQS", - "RCU_GP_DOING_FQS", - "RCU_GP_CLEANUP", - "RCU_GP_CLEANED", -}; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. 
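 *
 * A minimal sketch of that pattern (it mirrors the tp_rcu_varname
 * definition just below; the my* names are hypothetical):
 *	static char myname[] = "my_subsys";
 *	static const char *tp_myname __used __tracepoint_string = myname;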
@@ -400,73 +477,72 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; #define RCU_NAME rcu_name #endif /* #else #ifdef CONFIG_TRACING */ -int rcu_dynticks_snap(struct rcu_data *rdp); - -#ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); -#endif /* #ifdef CONFIG_RCU_BOOST */ - -/* Forward declarations for rcutree_plugin.h */ +/* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(void); -static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -static void rcu_flavor_check_callbacks(int user); -void call_rcu(struct rcu_head *head, rcu_callback_t func); +static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static void invoke_rcu_callbacks_kthread(void); -static bool rcu_is_callbacks_kthread(void); -static void __init rcu_spawn_boost_kthreads(void); -static void rcu_prepare_kthreads(int cpu); -static void rcu_cleanup_after_idle(void); -static void rcu_prepare_for_idle(void); -static void rcu_idle_count_callbacks_posted(void); +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); +static void rcu_cpu_kthread_setup(unsigned int cpu); +static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); -static void rcu_preempt_deferred_qs(struct task_struct *t); -static void print_cpu_stall_info_begin(void); -static void print_cpu_stall_info(int cpu); -static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); -static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, - struct rcu_data *rdp, - unsigned long flags); -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); -static void do_nocb_deferred_wakeup(struct rcu_data *rdp); +static bool wake_nocb_gp(struct rcu_data *rdp, bool force); +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j, bool lazy); +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy); +static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags); +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 
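/*
 * Sketch of the call_rcu() path that the nocb hooks declared here end up
 * servicing (standard API shown for illustration; the struct and
 * callback names are hypothetical):
 *
 *	struct my_obj {
 *		struct rcu_head rh;
 *		int payload;
 *	};
 *
 *	static void my_free_cb(struct rcu_head *rhp)
 *	{
 *		kfree(container_of(rhp, struct my_obj, rh));
 *	}
 *
 *	call_rcu(&obj->rh, my_free_cb);	// may queue to ->cblist or bypass
 */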
-static void rcu_spawn_all_nocb_kthreads(int cpu); -static void __init rcu_spawn_nocb_kthreads(void); +static void rcu_spawn_cpu_nocb_kthread(int cpu); +static void show_rcu_nocb_state(struct rcu_data *rdp); +static void rcu_nocb_lock(struct rcu_data *rdp); +static void rcu_nocb_unlock(struct rcu_data *rdp); +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags); +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -static bool init_nocb_callback_list(struct rcu_data *rdp); -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); + +/* + * Disable IRQs before checking offloaded state so that local + * locking is safe against concurrent de-offloading. + */ +#define rcu_nocb_lock_irqsave(rdp, flags) \ +do { \ + local_irq_save(flags); \ + if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ + raw_spin_lock(&(rdp)->nocb_lock); \ +} while (0) +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags) +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); -static void rcu_dynticks_task_enter(void); -static void rcu_dynticks_task_exit(void); - -#ifdef CONFIG_SRCU -void srcu_online_cpu(unsigned int cpu); -void srcu_offline_cpu(unsigned int cpu); -#else /* #ifdef CONFIG_SRCU */ -void srcu_online_cpu(unsigned int cpu) { } -void srcu_offline_cpu(unsigned int cpu) { } -#endif /* #else #ifdef CONFIG_SRCU */ + +/* Forward declarations for tree_stall.h */ +static void record_gp_stall_check_time(void); +static void rcu_iw_handler(struct irq_work *iwp); +static void check_cpu_stall(struct rcu_data *rdp); +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay); + +/* Forward declarations for tree_exp.h. */ +static void sync_rcu_do_polled_gp(struct work_struct *wp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 928fe5893a57..96c49c56fc14 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -1,37 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU expedited grace periods * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2016 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ +#include <linux/console.h> #include <linux/lockdep.h> +static void rcu_exp_handler(void *unused); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp); + /* * Record the start of an expedited grace period. 
*/ static void rcu_exp_gp_seq_start(void) { rcu_seq_start(&rcu_state.expedited_sequence); + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); } /* - * Return then value that expedited-grace-period counter will have + * Return the value that the expedited-grace-period counter will have * at the end of the current grace period. */ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) @@ -44,12 +37,15 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) */ static void rcu_exp_gp_seq_end(void) { + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); rcu_seq_end(&rcu_state.expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } /* - * Take a snapshot of the expedited-grace-period counter. + * Take a snapshot of the expedited-grace-period counter, which is the + * earliest value that will indicate that a full grace period has + * elapsed since the current time. */ static unsigned long rcu_exp_gp_seq_snap(void) { @@ -144,7 +140,14 @@ static void __maybe_unused sync_exp_reset_tree(void) rcu_for_each_node_breadth_first(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; + WRITE_ONCE(rnp->expmask, rnp->expmaskinit); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp)) + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -153,37 +156,31 @@ static void __maybe_unused sync_exp_reset_tree(void) * Return non-zero if there is no RCU expedited grace period in progress * for the specified rcu_node structure, in other words, if all CPUs and * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the specificed rcu_node structure's ->lock + * for the current expedited grace period. */ -static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - - return rnp->exp_tasks == NULL && + return READ_ONCE(rnp->exp_tasks) == NULL && READ_ONCE(rnp->expmask) == 0; } /* - * Like sync_rcu_preempt_exp_done(), but this function assumes the caller - * doesn't hold the rcu_node's ->lock, and will acquire and release the lock - * itself + * Like sync_rcu_exp_done(), but where the caller does not hold the + * rcu_node's ->lock. */ -static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) { unsigned long flags; bool ret; raw_spin_lock_irqsave_rcu_node(rnp, flags); - ret = sync_rcu_preempt_exp_done(rnp); + ret = sync_rcu_exp_done(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ret; } - /* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU @@ -191,8 +188,6 @@ static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) * which the task was queued or to one of that rcu_node structure's ancestors, * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) - * - * Caller must hold the specified rcu_node structure's ->lock. 
*/ static void __rcu_report_exp_rnp(struct rcu_node *rnp, bool wake, unsigned long flags) @@ -200,8 +195,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, { unsigned long mask; + raw_lockdep_assert_held_rcu_node(rnp); for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { + if (!sync_rcu_exp_done(rnp)) { if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else @@ -210,10 +206,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ + if (wake) swake_up_one(&rcu_state.expedited_wq); - } + break; } mask = rnp->grpmask; @@ -221,7 +216,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); } } @@ -239,19 +234,30 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. + * specified leaf rcu_node structure, which is acquired by the caller. */ -static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, - unsigned long mask, bool wake) +static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, + unsigned long mask_in, bool wake) + __releases(rnp->lock) { - unsigned long flags; + int cpu; + unsigned long mask; + struct rcu_data *rdp; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { + raw_lockdep_assert_held_rcu_node(rnp); + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - rnp->expmask &= ~mask; + mask = mask_in & rnp->expmask; + WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (!IS_ENABLED(CONFIG_NO_HZ_FULL) || !rdp->rcu_forced_tick_exp) + continue; + rdp->rcu_forced_tick_exp = false; + tick_dep_clear_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + } __rcu_report_exp_rnp(rnp, wake, flags); /* Releases rnp->lock. */ } @@ -260,8 +266,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->deferred_qs, false); - rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); + unsigned long flags; + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); + ASSERT_EXCLUSIVE_WRITER(rdp->cpu_no_qs.b.exp); + rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true); } /* Common code for work-done checking. */ @@ -269,8 +280,12 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ + /* + * Order GP completion with preceding accesses. Order also GP + * completion with post GP update side accesses. Pairs with + * rcu_seq_end(). + */ + smp_mb(); return true; } return false; @@ -320,7 +335,7 @@ static bool exp_funnel_lock(unsigned long s) sync_exp_work_done(s)); return true; } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ + WRITE_ONCE(rnp->exp_seq_rq, s); /* Followers can wait on us. 
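 * Funnel model: each waiter snapshots s from the expedited sequence
 * and climbs the tree; on finding exp_seq_rq already at or past s it
 * sleeps via
 *	wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s));
 * The snapshot arithmetic (mirroring the rcu_seq helpers in rcu.h):
 *	s = (seq + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
 * rounds up past any in-flight grace period, so s reads as "done"
 * only once a full expedited GP has started and ended after the
 * snapshot was taken.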
*/ spin_unlock(&rnp->exp_lock); trace_rcu_exp_funnel_lock(rcu_state.name, rnp->level, rnp->grplo, rnp->grphi, TPS("nxtlvl")); @@ -340,67 +355,74 @@ fastpath: * Select the CPUs within the specified rcu_node that the upcoming * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) +static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) { int cpu; unsigned long flags; - smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); - func = rewp->rew_func; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; int snap; if (raw_smp_processor_id() == cpu || !(rnp->qsmaskinitnext & mask)) { mask_ofl_test |= mask; } else { - snap = rcu_dynticks_snap(rdp); - if (rcu_dynticks_in_eqs(snap)) + /* + * Full ordering between remote CPU's post idle accesses + * and updater's accesses prior to current GP (and also + * the started GP sequence number) is enforced by + * rcu_seq_start() implicit barrier, relayed by kworkers + * locking and even further by smp_mb__after_unlock_lock() + * barriers chained all the way throughout the rnp locking + * tree since sync_exp_reset_tree() and up to the current + * leaf rnp locking. + * + * Ordering between remote CPU's pre idle accesses and + * post grace period updater's accesses is enforced by the + * below acquire semantic. + */ + snap = ct_rcu_watching_cpu_acquire(cpu); + if (rcu_watching_snap_in_eqs(snap)) mask_ofl_test |= mask; else - rdp->exp_dynticks_snap = snap; + rdp->exp_watching_snap = snap; } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited GP - * until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + for_each_leaf_node_cpu_mask(rnp, cpu, mask_ofl_ipi) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + unsigned long mask = rdp->grpmask; - if (!(mask_ofl_ipi & mask)) - continue; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->exp_watching_snap)) { mask_ofl_test |= mask; continue; } - ret = smp_call_function_single(cpu, func, NULL, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; + if (get_cpu() == cpu) { + mask_ofl_test |= mask; + put_cpu(); continue; } + ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); + /* The CPU will report the QS in response to the IPI. */ + if (!ret) + continue; + /* Failed, raced with CPU hotplug operation. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rnp->qsmaskinitnext & mask) && @@ -408,27 +430,80 @@ retry_ipi: /* Online, so delay for a bit and try again. 
*/ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl")); - schedule_timeout_uninterruptible(1); + schedule_timeout_idle(1); goto retry_ipi; } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + /* CPU really is offline, so we must report its QS. */ + if (rnp->expmask & mask) + mask_ofl_test |= mask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); + if (mask_ofl_test) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_report_exp_cpu_mult(rnp, flags, mask_ofl_test, false); + } +} + +static void rcu_exp_sel_wait_wake(unsigned long s); + +static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + + __sync_rcu_exp_select_node_cpus(rewp); +} + +static inline bool rcu_exp_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_gp_kworker); +} + +static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp) +{ + return !!READ_ONCE(rnp->exp_kworker); +} + +static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) +{ + kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + /* + * Use rcu_exp_par_gp_kworker, because flushing a work item from + * another work item on the same kthread worker can result in + * deadlock. + */ + kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work); +} + +static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) +{ + kthread_flush_work(&rnp->rew.rew_work); +} + +/* + * Work-queue handler to drive an expedited grace period forward. + */ +static void wait_rcu_exp_gp(struct kthread_work *wp) +{ + struct rcu_exp_work *rewp; + + rewp = container_of(wp, struct rcu_exp_work, rew_work); + rcu_exp_sel_wait_wake(rewp->rew_s); +} + +static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) +{ + kthread_init_work(&rew->rew_work, wait_rcu_exp_gp); + kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); } /* * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_rcu_exp_select_cpus(smp_call_func_t func) +static void sync_rcu_exp_select_cpus(void) { - int cpu; struct rcu_node *rnp; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset")); @@ -440,104 +515,162 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - rnp->rew.rew_func = func; - if (!READ_ONCE(rcu_par_gp_wq) || + if (!rcu_exp_par_worker_started(rnp) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { - /* No workqueues yet or last leaf, do direct call. */ + /* No worker started yet or last leaf, do direct call. */ sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); continue; } - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - preempt_disable(); - cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - /* If all offline, queue the work on an unbound CPU. 
*/ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); - preempt_enable(); + sync_rcu_exp_select_cpus_queue_work(rnp); rnp->exp_need_flush = true; } - /* Wait for workqueue jobs (if any) to complete. */ + /* Wait for jobs (if any) to complete. */ rcu_for_each_leaf_node(rnp) if (rnp->exp_need_flush) - flush_work(&rnp->rew.rew_work); + sync_rcu_exp_select_cpus_flush_work(rnp); +} + +/* + * Wait for the expedited grace period to elapse, within time limit. + * If the time limit is exceeded without the grace period elapsing, + * return false, otherwise return true. + */ +static bool synchronize_rcu_expedited_wait_once(long tlimit) +{ + int t; + struct rcu_node *rnp_root = rcu_get_root(); + + t = swait_event_timeout_exclusive(rcu_state.expedited_wq, + sync_rcu_exp_done_unlocked(rnp_root), + tlimit); + // Workqueues should not be signaled. + if (t > 0 || sync_rcu_exp_done_unlocked(rnp_root)) + return true; + WARN_ON(t < 0); /* workqueues should not be signaled. */ + return false; } -static void synchronize_sched_expedited_wait(void) +/* + * Print out an expedited RCU CPU stall warning message. + */ +static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j) { int cpu; - unsigned long jiffies_stall; - unsigned long jiffies_start; unsigned long mask; int ndetected; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(); - int ret; + + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); + ndetected = 0; + rcu_for_each_leaf_node(rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp; + + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(&rcu_data, cpu); + pr_cont(" %d-%c%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); + } + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); + if (ndetected) { + pr_err("blocking rcu_node structures (internal RCU debug):"); + rcu_for_each_node_breadth_first(rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_exp_done_unlocked(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + dump_cpu_task(cpu); + } + rcu_exp_print_detail_task_stall_rnp(rnp); + } +} + +/* + * Wait for the expedited grace period to elapse, issuing any needed + * RCU CPU stall warnings along the way. 
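 * The first check fires after rcu_exp_jiffies_till_stall_check() and
 * each repeat after triple that interval, per the jiffies_stall update
 * below. A hedged tuning example, assuming the standard rcupdate stall
 * parameter (value in milliseconds, 0 meaning fall back to the normal
 * RCU stall timeout):
 *	rcupdate.rcu_exp_cpu_stall_timeout=100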
+ */ +static void synchronize_rcu_expedited_wait(void) +{ + int cpu; + unsigned long j; + unsigned long jiffies_stall; + unsigned long jiffies_start; + unsigned long mask; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long flags; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); - jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_stall = rcu_exp_jiffies_till_stall_check(); jiffies_start = jiffies; - - for (;;) { - ret = swait_event_timeout_exclusive( - rcu_state.expedited_wq, - sync_rcu_preempt_exp_done_unlocked(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) + if (tick_nohz_full_enabled() && rcu_inkernel_boot_has_ended()) { + if (synchronize_rcu_expedited_wait_once(1)) return; - WARN_ON(ret < 0); /* workqueues should not be signaled. */ - if (rcu_cpu_stall_suppress) - continue; - panic_on_rcu_stall(); - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rcu_state.name); - ndetected = 0; rcu_for_each_leaf_node(rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) { - struct rcu_data *rdp; - - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) - continue; - ndetected++; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + mask = READ_ONCE(rnp->expmask); + for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); - } - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rcu_state.expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (ndetected) { - pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done_unlocked(rnp)) + if (rdp->rcu_forced_tick_exp) continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, - ".T"[!!rnp->exp_tasks]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rnp) { - for_each_leaf_node_possible_cpu(rnp, cpu) { - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(rnp->expmask & mask)) - continue; - dump_cpu_task(cpu); + rdp->rcu_forced_tick_exp = true; + if (cpu_online(cpu)) + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + j = READ_ONCE(jiffies_till_first_fqs); + if (synchronize_rcu_expedited_wait_once(j + HZ)) + return; + } + + for (;;) { + unsigned long j; + + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) + return; + if (rcu_stall_is_suppressed()) + continue; + + nbcon_cpu_emergency_enter(); + + j = jiffies; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); + trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); + synchronize_rcu_expedited_stall(jiffies_start, j); + jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; + + nbcon_cpu_emergency_exit(); + + panic_on_rcu_stall(); } } @@ -551,26 +684,25 @@ static void rcu_exp_wait_wake(unsigned long s) { struct rcu_node *rnp; - synchronize_sched_expedited_wait(); - rcu_exp_gp_seq_end(); - trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); + synchronize_rcu_expedited_wait(); - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. 
- */ + // Switch over to wakeup mode, allowing the next GP to proceed. + // End the previous grace period only after acquiring the mutex + // to ensure that only one GP runs concurrently with wakeups. mutex_lock(&rcu_state.exp_wake_mutex); + rcu_exp_gp_seq_end(); + trace_rcu_exp_grace_period(rcu_state.name, s, TPS("end")); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { spin_lock(&rnp->exp_lock); /* Recheck, avoid hang in case someone just arrived. */ if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; + WRITE_ONCE(rnp->exp_seq_rq, s); spin_unlock(&rnp->exp_lock); } smp_mb(); /* All above changes before wakeup. */ - wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rcu_state.expedited_sequence) & 0x3]); + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]); } trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake")); mutex_unlock(&rcu_state.exp_wake_mutex); @@ -580,69 +712,24 @@ static void rcu_exp_wait_wake(unsigned long s) * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. */ -static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s) +static void rcu_exp_sel_wait_wake(unsigned long s) { /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(func); + sync_rcu_exp_select_cpus(); /* Wait and clean up, including waking everyone. */ rcu_exp_wait_wake(s); } -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) { - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s); -} - -/* - * Given a smp_call_function() handler, kick off the specified - * implementation of expedited grace period. - */ -static void _synchronize_rcu_expedited(smp_call_func_t func) -{ - struct rcu_data *rdp; - struct rcu_exp_work rew; - struct rcu_node *rnp; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(); - if (exp_funnel_lock(s)) - return; /* Someone else did our work for us. */ - - /* Ensure that load happens before action based on it. */ - if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { - /* Direct call during scheduler init and early_initcalls(). */ - rcu_exp_sel_wait_wake(func, s); - } else { - /* Marshall arguments & schedule the expedited grace period. */ - rew.rew_func = func; - rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); - } - - /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); - rnp = rcu_get_root(); - wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ - - /* Let the next expedited grace period start. */ - mutex_unlock(&rcu_state.exp_mutex); + lockdep_assert_irqs_disabled(); + ASSERT_EXCLUSIVE_WRITER_SCOPED(*this_cpu_ptr(&rcu_data.cpu_no_qs.b.exp)); + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. 
*/ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_need_resched_current(); } #ifdef CONFIG_PREEMPT_RCU @@ -654,32 +741,39 @@ static void _synchronize_rcu_expedited(smp_call_func_t func) * ->expmask fields in the rcu_node tree. Otherwise, immediately * report the quiescent state. */ -static void sync_rcu_exp_handler(void *unused) +static void rcu_exp_handler(void *unused) { + int depth = rcu_preempt_depth(); unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; struct task_struct *t = current; /* - * First, the common case of not being in an RCU read-side + * WARN if the CPU is unexpectedly already looking for a + * QS or has already reported one. + */ + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); + if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + READ_ONCE(rdp->cpu_no_qs.b.exp))) + return; + + /* + * Second, the common case of not being in an RCU read-side * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ - if (!t->rcu_read_lock_nesting) { + if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - rcu_dynticks_curr_cpu_in_eqs()) { + rcu_is_cpu_rrupt_from_idle()) rcu_report_exp_rdp(rdp); - } else { - rdp->deferred_qs = true; - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + else + rcu_exp_need_qs(); return; } /* - * Second, the less-common case of being in an RCU read-side + * Third, the less-common case of being in an RCU read-side * critical section. In this case we can count on a future * rcu_read_unlock(). However, this rcu_read_unlock() might * execute on some other CPU, but in that case there will be @@ -690,152 +784,330 @@ static void sync_rcu_exp_handler(void *unused) * can have caused this quiescent state to already have been * reported, so we really do need to check ->expmask. */ - if (t->rcu_read_lock_nesting > 0) { + if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->deferred_qs = true; - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); + WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); + t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; } - /* - * The final and least likely case is where the interrupted - * code was just about to or just finished exiting the RCU-preempt - * read-side critical section, and no, we can't tell which. - * So either way, set ->deferred_qs to flag later code that - * a quiescent state is required. - * - * If the CPU is fully enabled (or if some buggy RCU-preempt - * read-side critical section is being used from idle), just - * invoke rcu_preempt_defer_qs() to immediately report the - * quiescent state. We cannot use rcu_read_unlock_special() - * because we are in an interrupt handler, which will cause that - * function to take an early exit without doing anything. - * - * Otherwise, force a context switch after the CPU enables everything. - */ - rdp->deferred_qs = true; - if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { - rcu_preempt_deferred_qs(t); - } else { - set_tsk_need_resched(t); - set_preempt_need_resched(); + // Fourth and finally, negative nesting depth should not happen. + WARN_ON_ONCE(1); +} + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. 
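+ * Returns the number of tasks so printed, which the caller folds into
+ * its running ndetected count.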
+ */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + unsigned long flags; + int ndetected = 0; + struct task_struct *t; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rnp->exp_tasks) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return 0; } + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return ndetected; } -/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */ -static void sync_sched_exp_online_cleanup(int cpu) +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, dumping the stack of each that is blocking the current + * expedited grace period. + */ +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) { + unsigned long flags; + struct task_struct *t; + + if (!rcu_exp_stall_task_details) + return; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!READ_ONCE(rnp->exp_tasks)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); + sched_show_task(t); + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void rcu_exp_handler(void *unused) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; + bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); + + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + return; + if (rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) { + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + return; + } + rcu_exp_need_qs(); +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + return 0; +} + +/* + * Because preemptible RCU does not exist, we never have to print out + * tasks blocked within RCU read-side critical sections that are blocking + * the current expedited grace period. + */ +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +} + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + /** * synchronize_rcu_expedited - Brute-force RCU grace period * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. + * Wait for an RCU grace period, but expedite it. The basic idea is to + * IPI all non-idle non-nohz online CPUs. 
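+ * (Idle CPUs and nohz_full CPUs executing in userspace are already in
+ * extended quiescent states, so no IPI is needed for them.)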
The IPI handler checks whether
+ * the CPU is in an RCU critical section, and if so, it sets a flag that
+ * causes the outermost rcu_read_unlock() to report the quiescent state
+ * for RCU-preempt or asks the scheduler for help for RCU-sched. On the
+ * other hand, if the CPU is not in an RCU read-side critical section,
+ * the IPI handler reports the quiescent state immediately.
 *
- * Although this is a greate improvement over previous expedited
+ * Although this is a great improvement over previous expedited
 * implementations, it is still unfriendly to real-time workloads, so is
 * thus not recommended for any sort of common-case code. In fact, if
 * you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
+ * your code to batch your updates, and then use a single synchronize_rcu()
 * instead.
 *
 * This has the same semantics as (but is more brutal than) synchronize_rcu().
 */
 void synchronize_rcu_expedited(void)
 {
+ unsigned long flags;
+ struct rcu_exp_work rew;
+ struct rcu_node *rnp;
+ unsigned long s;
+
 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
 lock_is_held(&rcu_lock_map) ||
 lock_is_held(&rcu_sched_lock_map),
 "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
 
- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ /* Is the state such that this call is already a grace period? */
+ if (rcu_blocking_is_gp()) {
+ // Note well that this code runs with !PREEMPT && !SMP.
+ // In addition, all code that advances grace periods runs
+ // at process level. Therefore, this expedited GP overlaps
+ // with other expedited GPs only by being fully nested within
+ // them, which allows reuse of ->gp_seq_polled_exp_snap.
+ rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+ rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+
+ local_irq_save(flags);
+ WARN_ON_ONCE(num_online_cpus() > 1);
+ rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT);
+ local_irq_restore(flags);
+ return; // Context allows vacuous grace periods.
+ }
+
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ synchronize_rcu_normal();
 return;
- _synchronize_rcu_expedited(sync_rcu_exp_handler);
+ }
+
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap();
+ if (exp_funnel_lock(s))
+ return; /* Someone else did our work for us. */
+
+ /* Ensure that load happens before action based on it. */
+ if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) {
+ /* Direct call during scheduler init and early_initcalls(). */
+ rcu_exp_sel_wait_wake(s);
+ } else {
+ /* Marshall arguments & schedule the expedited grace period. */
+ rew.rew_s = s;
+ synchronize_rcu_expedited_queue_work(&rew);
+ }
+
+ /* Wait for expedited grace period to complete. */
+ rnp = rcu_get_root();
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
+ sync_exp_work_done(s));
+
+ /* Let the next expedited grace period start. */
+ mutex_unlock(&rcu_state.exp_mutex);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
-#else /* #ifdef CONFIG_PREEMPT_RCU */
-
-/* Invoked on each online non-idle CPU for expedited quiescent state. */
-static void sync_sched_exp_handler(void *unused)
+/*
+ * Ensure that start_poll_synchronize_rcu_expedited() has the expedited
+ * RCU grace periods that it needs.
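+ * This is the handler for the rcu_node structure's ->exp_poll_wq work
+ * item; it keeps invoking synchronize_rcu_expedited() until
+ * poll_state_synchronize_rcu() reports that the requested cookie has
+ * been reached.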
+ */ +static void sync_rcu_do_polled_gp(struct work_struct *wp) { - struct rcu_data *rdp; - struct rcu_node *rnp; + unsigned long flags; + int i = 0; + struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq); + unsigned long s; - rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) - return; - if (rcu_is_cpu_rrupt_from_idle()) { - rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + if (s == RCU_GET_STATE_COMPLETED) return; + while (!poll_state_synchronize_rcu(s)) { + synchronize_rcu_expedited(); + if (i == 10 || i == 20) + pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled)); + i++; } - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + if (poll_state_synchronize_rcu(s)) + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); } -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) +/** + * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period + * + * Returns a cookie to pass to a call to cond_synchronize_rcu(), + * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(), + * allowing them to determine whether or not any sort of grace period has + * elapsed in the meantime. If the needed expedited grace period is not + * already slated to start, initiates that grace period. + */ +unsigned long start_poll_synchronize_rcu_expedited(void) { + unsigned long flags; struct rcu_data *rdp; - int ret; struct rcu_node *rnp; + unsigned long s; - rdp = per_cpu_ptr(&rcu_data, cpu); + s = get_state_synchronize_rcu(); + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0); - WARN_ON_ONCE(ret); + if (rcu_init_invoked()) + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + if (!poll_state_synchronize_rcu(s)) { + if (rcu_init_invoked()) { + rnp->exp_seq_poll_rq = s; + queue_work(rcu_gp_wq, &rnp->exp_poll_wq); + } + } + if (rcu_init_invoked()) + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + + return s; } +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited); -/* - * Because a context switch is a grace period for !PREEMPT, any - * blocking grace-period wait automatically implies a grace period if - * there is only one CPU online at any point time during execution of - * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds some - * overhead: RCU still operates correctly. +/** + * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period + * @rgosp: Place to put snapshot of grace-period state + * + * Places the normal and expedited grace-period states in rgosp. 
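+ * (Both states are captured so that completion of either a normal or an
+ * expedited grace period can satisfy a later poll of @rgosp.)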
This + * state value can be passed to a later call to cond_synchronize_rcu_full() + * or poll_state_synchronize_rcu_full() to determine whether or not a + * grace period (whether normal or expedited) has elapsed in the meantime. + * If the needed expedited grace period is not already slated to start, + * initiates that grace period. */ -static int rcu_blocking_is_gp(void) +void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp) { - int ret; - - might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - ret = num_online_cpus() <= 1; - preempt_enable(); - return ret; + get_state_synchronize_rcu_full(rgosp); + (void)start_poll_synchronize_rcu_expedited(); } +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full); -/* PREEMPT=n implementation of synchronize_rcu_expedited(). */ -void synchronize_rcu_expedited(void) +/** + * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period + * + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() + * + * If any type of full RCU grace period has elapsed since the earlier + * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(), + * or start_poll_synchronize_rcu_expedited(), just return. Otherwise, + * invoke synchronize_rcu_expedited() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate and that returned at the end + * of this function. + */ +void cond_synchronize_rcu_expedited(unsigned long oldstate) { - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - - /* If only one CPU, this is automatically a grace period. */ - if (rcu_blocking_is_gp()) - return; - - _synchronize_rcu_expedited(sync_sched_exp_handler); + if (!poll_state_synchronize_rcu(oldstate)) + synchronize_rcu_expedited(); } -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited); -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +/** + * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period + * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() + * + * If a full RCU grace period has elapsed since the call to + * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), + * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was + * obtained, just return. Otherwise, invoke synchronize_rcu_expedited() + * to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. 
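+ *
+ * A minimal usage sketch, with update_foo() standing in for a hypothetical
+ * update-side change:
+ *
+ *	struct rcu_gp_oldstate rgos;
+ *
+ *	start_poll_synchronize_rcu_expedited_full(&rgos);
+ *	update_foo();
+ *	cond_synchronize_rcu_expedited_full(&rgos);	// Waits only if needed.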
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+ if (!poll_state_synchronize_rcu_full(rgosp))
+ synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
new file mode 100644
index 000000000000..e6cd56603cad
--- /dev/null
+++ b/kernel/rcu/tree_nocb.h
@@ -0,0 +1,1705 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ * Copyright SUSE, 2021
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
+ * Frederic Weisbecker <frederic@kernel.org>
+ */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ /* Race on early boot between thread creation and assignment */
+ if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
+ return true;
+
+ if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
+ if (in_task())
+ return true;
+ return false;
+}
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
+ */
+
+
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * If the list is invalid, a warning is emitted and all CPUs are offloaded.
+ */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ if (*str == '=') {
+ if (cpulist_parse(++str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
+ }
+ rcu_state.nocb_is_setup = true;
+ return 1;
+}
+__setup("rcu_nocbs", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = true;
+ return 1;
+}
+__setup("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which can only happen at high call_rcu() rates.
+ */
+static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, perform a minimal sanity check.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+ __acquires(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ /*
+ * Contention is expected only when a local enqueue collides with
+ * a remote flush from a kthread.
+ */
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+ __releases(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_rdp_is_offloaded(rdp))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_rdp_is_offloaded(rdp))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
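+ * The swait queue was handed out by rcu_nocb_gp_get() below, indexed by
+ * the bottom bit of the grace-period sequence counter.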
+ */
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+ swake_up_all(sq);
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
+}
+
+static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp,
+ bool force, unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ bool needwake = false;
+
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ return false;
+ }
+
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ timer_delete(&rdp_gp->nocb_timer);
+ }
+
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
+ swake_up_one(&rdp_gp->nocb_gp_wq);
+ }
+
+ return needwake;
+}
+
+/*
+ * Kick the GP kthread for this NOCB group.
+ */
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+}
+
+#ifdef CONFIG_RCU_LAZY
+/*
+ * LAZY_FLUSH_JIFFIES decides the maximum amount of time that
+ * can elapse before lazy callbacks are flushed. Lazy callbacks
+ * could be flushed much earlier for a number of other reasons;
+ * however, LAZY_FLUSH_JIFFIES ensures that no lazy callbacks are
+ * left unsubmitted to RCU for longer than that many jiffies.
+ */
+#define LAZY_FLUSH_JIFFIES (10 * HZ)
+static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES;
+
+// To be called only from test code.
+void rcu_set_jiffies_lazy_flush(unsigned long jif)
+{
+ jiffies_lazy_flush = jif;
+}
+EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush);
+
+unsigned long rcu_get_jiffies_lazy_flush(void)
+{
+ return jiffies_lazy_flush;
+}
+EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush);
+#endif
+
+/*
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
+ */
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
+ /*
+ * Bypass wakeup overrides previous deferments. In case of
+ * callback storms, no need to wake up too early.
+ */
+ if (waketype == RCU_NOCB_WAKE_LAZY &&
+ rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else {
+ if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+ if (rdp_gp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ }
+
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Return true if there was something to be flushed and it succeeded, otherwise + * false. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp_in, + unsigned long j, bool lazy) +{ + struct rcu_cblist rcl; + struct rcu_head *rhp = rhp_in; + + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); + rcu_lockdep_assert_cblist_protected(rdp); + lockdep_assert_held(&rdp->nocb_bypass_lock); + if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { + raw_spin_unlock(&rdp->nocb_bypass_lock); + return false; + } + /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ + if (rhp) + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ + + /* + * If the new CB requested was a lazy one, queue it onto the main + * ->cblist so that we can take advantage of the grace-period that will + * happen regardless. But queue it onto the bypass list first so that + * the lazy CB is ordered with the existing CBs in the bypass list. + */ + if (lazy && rhp) { + rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); + rhp = NULL; + } + rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); + WRITE_ONCE(rdp->lazy_len, 0); + + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); + WRITE_ONCE(rdp->nocb_bypass_first, j); + rcu_nocb_bypass_unlock(rdp); + return true; +} + +/* + * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. + * However, if there is a callback to be enqueued and if ->nocb_bypass + * proves to be initially empty, just return false because the no-CB GP + * kthread may need to be awakened in this case. + * + * Note that this function always returns true if rhp is NULL. + */ +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j, bool lazy) +{ + if (!rcu_rdp_is_offloaded(rdp)) + return true; + rcu_lockdep_assert_cblist_protected(rdp); + rcu_nocb_bypass_lock(rdp); + return rcu_nocb_do_flush_bypass(rdp, rhp, j, lazy); +} + +/* + * If the ->nocb_bypass_lock is immediately available, flush the + * ->nocb_bypass queue into ->cblist. + */ +static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) +{ + rcu_lockdep_assert_cblist_protected(rdp); + if (!rcu_rdp_is_offloaded(rdp) || + !rcu_nocb_bypass_trylock(rdp)) + return; + WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false)); +} + +/* + * See whether it is appropriate to use the ->nocb_bypass list in order + * to control contention on ->nocb_lock. A limited number of direct + * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass + * is non-empty, further callbacks must be placed into ->nocb_bypass, + * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch + * back to direct use of ->cblist. However, ->nocb_bypass should not be + * used if ->cblist is empty, because otherwise callbacks can be stranded + * on ->nocb_bypass because we cannot count on the current CPU ever again + * invoking call_rcu(). The general rule is that if ->nocb_bypass is + * non-empty, the corresponding no-CBs grace-period kthread must not be + * in an indefinite sleep state. + * + * Finally, it is not permitted to use the bypass during early boot, + * as doing so would confuse the auto-initialization code. 
Besides + * which, there is no point in worrying about lock contention while + * there is only one CPU in operation. + */ +static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + bool *was_alldone, unsigned long flags, + bool lazy) +{ + unsigned long c; + unsigned long cur_gp_seq; + unsigned long j = jiffies; + long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len)); + + lockdep_assert_irqs_disabled(); + + // Pure softirq/rcuc based processing: no bypassing, no + // locking. + if (!rcu_rdp_is_offloaded(rdp)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // Don't use ->nocb_bypass during early boot. + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + rcu_nocb_lock(rdp); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // If we have advanced to a new jiffy, reset counts to allow + // moving back from ->nocb_bypass to ->cblist. + if (j == rdp->nocb_nobypass_last) { + c = rdp->nocb_nobypass_count + 1; + } else { + WRITE_ONCE(rdp->nocb_nobypass_last, j); + c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; + if (ULONG_CMP_LT(rdp->nocb_nobypass_count, + nocb_nobypass_lim_per_jiffy)) + c = 0; + else if (c > nocb_nobypass_lim_per_jiffy) + c = nocb_nobypass_lim_per_jiffy; + } + WRITE_ONCE(rdp->nocb_nobypass_count, c); + + // If there hasn't yet been all that many ->cblist enqueues + // this jiffy, tell the caller to enqueue onto ->cblist. But flush + // ->nocb_bypass first. + // Lazy CBs throttle this back and do immediate bypass queuing. + if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy && !lazy) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false)); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + + // If ->nocb_bypass has been used too long or is too full, + // flush ->nocb_bypass to ->cblist. + if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) || + (ncbs && bypass_is_lazy && + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) || + ncbs >= qhimark) { + rcu_nocb_lock(rdp); + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + + if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy)) { + if (*was_alldone) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, + TPS("FirstQ")); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + return false; // Caller must enqueue the callback. + } + if (j != rdp->nocb_gp_adv_time && + rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { + rcu_advance_cbs_nowake(rdp->mynode, rdp); + rdp->nocb_gp_adv_time = j; + } + + // The flush succeeded and we moved CBs into the regular list. + // Don't wait for the wake up timer as it may be too far ahead. + // Wake up the GP thread now instead, if the cblist was empty. + __call_rcu_nocb_wake(rdp, *was_alldone, flags); + + return true; // Callback already enqueued. + } + + // We need to use the bypass. + rcu_nocb_bypass_lock(rdp); + ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. 
*/
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+
+ if (lazy)
+ WRITE_ONCE(rdp->lazy_len, rdp->lazy_len + 1);
+
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+
+ // A wakeup of the grace period kthread or a timer adjustment
+ // needs to be done only if:
+ // 1. Bypass list was fully empty before (this is the first
+ // bypass list entry), or:
+ // 2. Both of these conditions are met:
+ // a. The bypass list previously had only lazy CBs, and:
+ // b. The new CB is non-lazy.
+ if (!ncbs || (bypass_is_lazy && !lazy)) {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock(rdp);
+ }
+ }
+ return true; // Callback already enqueued.
+}
+
+/*
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
+ *
+ * If warranted, also wake up the kthread servicing this CPU's queues.
+ */
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ long bypass_len;
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long lazy_len;
+ long len;
+ struct task_struct *t;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
+ if (rcu_nocb_poll || !t) {
+ rcu_nocb_unlock(rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+ // Need to actually do a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ bypass_len = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_len = READ_ONCE(rdp->lazy_len);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
+ // Only lazy CBs in bypass list
+ if (lazy_len && bypass_len == lazy_len) {
+ rcu_nocb_unlock(rdp);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazy"));
+ } else if (!irqs_disabled_flags(flags)) {
+ /* ... if queue was empty ... */
+ rcu_nocb_unlock(rdp);
+ wake_nocb_gp(rdp, false);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rcu_nocb_unlock(rdp);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ /* ... or if many callbacks queued. */
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ smp_mb(); /* Enqueue before timer_pending().
*/ + if ((rdp->nocb_cb_sleep || + !rcu_segcblist_ready_cbs(&rdp->cblist)) && + !timer_pending(&rdp_gp->nocb_timer)) { + rcu_nocb_unlock(rdp); + wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, + TPS("WakeOvfIsDeferred")); + } else { + rcu_nocb_unlock(rdp); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + } + } else { + rcu_nocb_unlock(rdp); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); + } +} + +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) +{ + bool was_alldone; + + if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) { + /* Not enqueued on bypass but locked, do regular enqueue */ + rcutree_enqueue(rdp, head, func); + __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ + } +} + +static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + + /* + * Locking orders future de-offloaded callbacks enqueue against previous + * handling of this rdp. Ie: Make sure rcuog is done with this rdp before + * deoffloaded callbacks can be enqueued. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + /* + * Offloading. Set our flag and notify the offload worker. + * We will handle this rdp until it ever gets de-offloaded. + */ + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); + rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED); + } else { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + list_del(&rdp->nocb_entry_rdp); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED); + } + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +} + +static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) +{ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); +} + +/* + * No-CBs GP kthreads come here to wait for additional callbacks to show up + * or for grace periods to end. + */ +static void nocb_gp_wait(struct rcu_data *my_rdp) +{ + bool bypass = false; + int __maybe_unused cpu = my_rdp->cpu; + unsigned long cur_gp_seq; + unsigned long flags; + bool gotcbs = false; + unsigned long j = jiffies; + bool lazy = false; + bool needwait_gp = false; // This prevents actual uninitialized use. + bool needwake; + bool needwake_gp; + struct rcu_data *rdp, *rdp_toggling = NULL; + struct rcu_node *rnp; + unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. + bool wasempty = false; + + /* + * Each pass through the following loop checks for CBs and for the + * nearest grace period (if any) to wait for next. The CB kthreads + * and the global grace-period kthread are awakened if needed. + */ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); + /* + * An rcu_data structure is removed from the list after its + * CPU is de-offloaded and added to the list before that CPU is + * (re-)offloaded. If the following loop happens to be referencing + * that rcu_data structure during the time that the corresponding + * CPU is de-offloaded and then immediately re-offloaded, this + * loop's rdp pointer will be carried to the end of the list by + * the resulting pair of list operations. 
This can cause the loop
+ * to skip over some of the rcu_data structures that were supposed
+ * to have been scanned. Fortunately, a new iteration through the
+ * entire loop is forced after a given CPU's rcu_data structure
+ * is added to the list, so the skipped-over rcu_data structures
+ * won't be ignored for long.
+ */
+ list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+ long bypass_ncbs;
+ bool flush_bypass = false;
+ long lazy_ncbs;
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ lockdep_assert_held(&rdp->nocb_lock);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+
+ if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ flush_bypass = true;
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue; /* No callbacks here, try next. */
+ }
+
+ if (flush_bypass) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ lazy_ncbs = READ_ONCE(rdp->lazy_len);
+ }
+
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ bypass_ncbs == lazy_ncbs ? TPS("Lazy") : TPS("Bypass"));
+ if (bypass_ncbs == lazy_ncbs)
+ lazy = true;
+ else
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(wasempty &&
+ !rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ }
+
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ if (!rcu_nocb_poll) {
+ // If the bypass list only has lazy CBs, add a deferred lazy wake up.
+ if (lazy && !bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_LAZY,
+ TPS("WakeLazyIsDeferred"));
+ // Otherwise add a deferred bypass wake up.
+ } else if (bypass) {
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
+ }
+ }
+
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series.
*/
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ if (list_empty(&my_rdp->nocb_head_rdp)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (!my_rdp->nocb_toggling_rdp)
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ /* Wait for any offloading rdp */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ schedule_timeout_idle(1);
+ }
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ nocb_gp_sleep(my_rdp, cpu);
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+ swait_event_interruptible_exclusive(
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+ }
+
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ // (De-)queue an rdp to/from the group if its nocb state is changing
+ rdp_toggling = my_rdp->nocb_toggling_rdp;
+ if (rdp_toggling)
+ my_rdp->nocb_toggling_rdp = NULL;
+
+ if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ timer_delete(&my_rdp->nocb_timer);
+ }
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ } else {
+ rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp);
+ if (rdp_toggling) {
+ /*
+ * Paranoid locking to make sure nocb_toggling_rdp is indeed
+ * reset *before* we (re)set SEGCBLIST_KTHREAD_GP or we could
+ * race with another round of nocb toggling for this rdp.
+ * Nocb locking should already prevent that, but we stick
+ * to paranoia, especially on this rare path.
+ */
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ my_rdp->nocb_toggling_rdp = NULL;
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ }
+
+ if (rdp_toggling) {
+ nocb_gp_toggle_rdp(my_rdp, rdp_toggling);
+ swake_up_one(&rdp_toggling->nocb_state_wq);
+ }
+
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
+}
+
+/*
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but it is spawned only once at least one CPU in that group
+ * has come online since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
+ */
+static int rcu_nocb_gp_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park();
+}
+
+/*
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
+ */
+static void nocb_cb_wait(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_gp = false;
+ struct rcu_node *rnp = rdp->mynode;
+
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+ if (kthread_should_park()) {
+ /*
+ * kthread_park() must be preceded by an rcu_barrier().
+ * But yet another rcu_barrier() might have sneaked in between
+ * the barrier callback execution and the callbacks counter
+ * decrement.
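+ * If so, ->nocb_cb_sleep is false again and parking is skipped for
+ * now: this kthread parks only once the callback list is verifiably
+ * empty, as the WARN_ON_ONCE() below checks.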
+ */
+ if (rdp->nocb_cb_sleep) {
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ kthread_parkme();
+ }
+ } else if (READ_ONCE(rdp->nocb_cb_sleep)) {
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
+
+ local_irq_save(flags);
+ rcu_momentary_eqs();
+ local_irq_restore(flags);
+ /*
+ * Disable BH to provide the expected environment. Also, when
+ * transitioning to/from NOCB mode, a self-requeuing callback might
+ * be invoked from softirq. A short grace period could cause both
+ * instances of this callback to execute concurrently.
+ */
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+
+ if (!rcu_segcblist_ready_cbs(cblist)) {
+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+ } else {
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ }
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
+ */
+static int rcu_nocb_cb_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
+ for (;;) {
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp, int level,
+ unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ int ndw;
+ int ret;
+
+ if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ return false;
+ }
+
+ ndw = rdp_gp->nocb_defer_wakeup;
+ ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = timer_container_of(rdp, t, nocb_timer);
+
+ WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+ raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from the fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
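+ * The unlocked read of ->nocb_defer_wakeup below is that inexact
+ * common-case check.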
+ */
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+ return false;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
+}
+
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp)
+{
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ // Queue this rdp for add/del to/from the list to iterate on rcuog
+ WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ return wake_gp;
+}
+
+static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ bool ret;
+
+ /*
+ * Locking makes sure rcuog is done handling this rdp before deoffloaded
+ * enqueue can happen. It also keeps the SEGCBLIST_OFFLOADED flag stable
+ * while the ->nocb_lock is held.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ return ret;
+}
+
+static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ int wake_gp;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ /* CPU must be offline, unless it's early boot */
+ WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id());
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ /* Flush all callbacks from segcblist and bypass */
+ rcu_barrier();
+
+ /*
+ * Make sure the rcuoc kthread isn't in the middle of a nocb locked
+ * sequence while offloading is deactivated, along with nocb locking.
+ */
+ if (rdp->nocb_cb_kthread)
+ kthread_park(rdp->nocb_cb_kthread);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
+
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ if (rdp_gp->nocb_gp_kthread) {
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_nocb_rdp_deoffload_wait_cond(rdp));
+ } else {
+ /*
+ * No kthread to clear the flags for us or remove the rdp from the nocb list
+ * to iterate. Do it here instead. Locking doesn't look strictly necessary
+ * but we stick to paranoia in this rare path.
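+ * (This branch can be reached when the rcuog kthread was never spawned,
+ * for example after an early boot kthread-creation failure.)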
+ */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + list_del(&rdp->nocb_entry_rdp); + } + + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); + + return 0; +} + +int rcu_nocb_cpu_deoffload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rcu_state.nocb_mutex); + if (rcu_rdp_is_offloaded(rdp)) { + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_deoffload(rdp); + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu); + ret = -EINVAL; + } + } + mutex_unlock(&rcu_state.nocb_mutex); + cpus_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); + +static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + +static int rcu_nocb_rdp_offload(struct rcu_data *rdp) +{ + int wake_gp; + + WARN_ON_ONCE(cpu_online(rdp->cpu)); + /* + * For now we only support re-offload, ie: the rdp must have been + * offloaded on boot first. + */ + if (!rdp->nocb_gp_rdp) + return -EINVAL; + + if (WARN_ON_ONCE(!rdp->nocb_gp_kthread)) + return -EINVAL; + + pr_info("Offloading %d\n", rdp->cpu); + + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); + if (wake_gp) + wake_up_process(rdp->nocb_gp_kthread); + + swait_event_exclusive(rdp->nocb_state_wq, + rcu_nocb_rdp_offload_wait_cond(rdp)); + + kthread_unpark(rdp->nocb_cb_kthread); + + return 0; +} + +int rcu_nocb_cpu_offload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rcu_state.nocb_mutex); + if (!rcu_rdp_is_offloaded(rdp)) { + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_offload(rdp); + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu); + ret = -EINVAL; + } + } + mutex_unlock(&rcu_state.nocb_mutex); + cpus_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); + +#ifdef CONFIG_RCU_LAZY +static unsigned long +lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long count = 0; + + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + + /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ + if (!mutex_trylock(&rcu_state.nocb_mutex)) + return 0; + + /* Snapshot count of all CPUs */ + for_each_cpu(cpu, rcu_nocb_mask) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + count += READ_ONCE(rdp->lazy_len); + } + + mutex_unlock(&rcu_state.nocb_mutex); + + return count ? count : SHRINK_EMPTY; +} + +static unsigned long +lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long flags; + unsigned long count = 0; + + if (WARN_ON_ONCE(!cpumask_available(rcu_nocb_mask))) + return 0; + /* + * Protect against concurrent (de-)offloading. Otherwise nocb locking + * may be ignored or imbalanced. 
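+ * (rcu_nocb_lock_irqsave() takes ->nocb_lock only when the rdp is
+ * offloaded, so an offload transition observed mid-scan could pair a
+ * lock with a skipped unlock.)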
+ */ + if (!mutex_trylock(&rcu_state.nocb_mutex)) { + /* + * But really don't insist if nocb_mutex is contended since we + * can't guarantee that it will never engage in a dependency + * chain involving memory allocation. The lock is seldom contended + * anyway. + */ + return 0; + } + + /* Snapshot count of all CPUs */ + for_each_cpu(cpu, rcu_nocb_mask) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int _count; + + if (WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp))) + continue; + + if (!READ_ONCE(rdp->lazy_len)) + continue; + + rcu_nocb_lock_irqsave(rdp, flags); + /* + * Recheck under the nocb lock. Since we are not holding the bypass + * lock we may still race with increments from the enqueuer but still + * we know for sure if there is at least one lazy callback. + */ + _count = READ_ONCE(rdp->lazy_len); + if (!_count) { + rcu_nocb_unlock_irqrestore(rdp, flags); + continue; + } + rcu_nocb_try_flush_bypass(rdp, jiffies); + rcu_nocb_unlock_irqrestore(rdp, flags); + wake_nocb_gp(rdp, false); + sc->nr_to_scan -= _count; + count += _count; + if (sc->nr_to_scan <= 0) + break; + } + + mutex_unlock(&rcu_state.nocb_mutex); + + return count ? count : SHRINK_STOP; +} +#endif // #ifdef CONFIG_RCU_LAZY + +void __init rcu_init_nohz(void) +{ + int cpu; + struct rcu_data *rdp; + const struct cpumask *cpumask = NULL; + struct shrinker * __maybe_unused lazy_rcu_shrinker; + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) + cpumask = tick_nohz_full_mask; +#endif + + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) && + !rcu_state.nocb_is_setup && !cpumask) + cpumask = cpu_possible_mask; + + if (cpumask) { + if (!cpumask_available(rcu_nocb_mask)) { + if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { + pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); + return; + } + } + + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, cpumask); + rcu_state.nocb_is_setup = true; + } + + if (!rcu_state.nocb_is_setup) + return; + +#ifdef CONFIG_RCU_LAZY + lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy"); + if (!lazy_rcu_shrinker) { + pr_err("Failed to allocate lazy_rcu shrinker!\n"); + } else { + lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count; + lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan; + + shrinker_register(lazy_rcu_shrinker); + } +#endif // #ifdef CONFIG_RCU_LAZY + + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } + if (cpumask_empty(rcu_nocb_mask)) + pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); + else + pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", + cpumask_pr_args(rcu_nocb_mask)); + if (rcu_nocb_poll) + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); + + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rcu_segcblist_empty(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + } + rcu_organize_nocb_kthreads(); +} + +/* Initialize per-rcu_data variables for no-CBs CPUs. 
*/
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
+ raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
+ WRITE_ONCE(rdp->lazy_len, 0);
+ mutex_init(&rdp->nocb_gp_kthread_mutex);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
+ */
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
+ struct task_struct *t;
+ struct sched_param sp;
+
+ if (!rcu_scheduler_fully_active || !rcu_state.nocb_is_setup)
+ return;
+
+ /* If there already is an rcuo kthread, then nothing to do. */
+ if (rdp->nocb_cb_kthread)
+ return;
+
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ sp.sched_priority = kthread_prio;
+ rdp_gp = rdp->nocb_gp_rdp;
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+ goto err;
+ }
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ if (kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
+
+ /* Spawn the kthread for this CPU. */
+ t = kthread_create(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
+ goto err;
+
+ if (rcu_rdp_is_offloaded(rdp))
+ wake_up_process(t);
+ else
+ kthread_park(t);
+
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+ return;
+
+err:
+ /*
+ * No need to protect against concurrent rcu_barrier()
+ * because the number of callbacks should be 0 for a non-boot CPU,
+ * therefore rcu_barrier() shouldn't even try to grab the nocb_lock.
+ * But hold nocb_mutex to avoid nocb_lock imbalance from the shrinker.
+ */
+ WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist));
+ mutex_lock(&rcu_state.nocb_mutex);
+ if (rcu_rdp_is_offloaded(rdp)) {
+ rcu_nocb_rdp_deoffload(rdp);
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ }
+ mutex_unlock(&rcu_state.nocb_mutex);
+}
+
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
+
+/*
+ * Initialize GP-CB relationships for all no-CBs CPUs.
+ */
+static void __init rcu_organize_nocb_kthreads(void)
+{
+ int cpu;
+ bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. 
*/ + + if (!cpumask_available(rcu_nocb_mask)) + return; + if (ls == -1) { + ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); + rcu_nocb_gp_stride = ls; + } + + /* + * Each pass through this loop sets up one rcu_data structure. + * Should the corresponding CPU come online in the future, then + * we will spawn the needed set of rcu_nocb_kthread() kthreads. + */ + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu >= nl) { + /* New GP kthread, set up for CBs & next GP. */ + gotnocbs = true; + nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; + rdp_gp = rdp; + INIT_LIST_HEAD(&rdp->nocb_head_rdp); + if (dump_tree) { + if (!firsttime) + pr_cont("%s\n", gotnocbscbs + ? "" : " (self only)"); + gotnocbscbs = false; + firsttime = false; + pr_alert("%s: No-CB GP kthread CPU %d:", + __func__, cpu); + } + } else { + /* Another CB kthread, link to previous GP kthread. */ + gotnocbscbs = true; + if (dump_tree) + pr_cont(" %d", cpu); + } + rdp->nocb_gp_rdp = rdp_gp; + if (cpumask_test_cpu(cpu, rcu_nocb_mask)) + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); + } + if (gotnocbs && dump_tree) + pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); +} + +/* + * Bind the current task to the offloaded CPUs. If there are no offloaded + * CPUs, leave the task unbound. Splat if the bind attempt fails. + */ +void rcu_bind_current_to_nocb(void) +{ + if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask)) + WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); +} +EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); + +// The ->on_cpu field is available only in CONFIG_SMP=y, so... +#ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; +} +#else // #ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return ""; +} +#endif // #else #ifdef CONFIG_SMP + +/* + * Dump out nocb grace-period kthread state for the specified rcu_data + * structure. + */ +static void show_rcu_nocb_gp_state(struct rcu_data *rdp) +{ + struct rcu_node *rnp = rdp->mynode; + + pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", + rdp->cpu, + "kK"[!!rdp->nocb_gp_kthread], + "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], + "dD"[!!rdp->nocb_defer_wakeup], + "tT"[timer_pending(&rdp->nocb_timer)], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[swait_active(&rdp->nocb_gp_wq)], + ".W"[swait_active(&rnp->nocb_gp_wq[0])], + ".W"[swait_active(&rnp->nocb_gp_wq[1])], + ".B"[!!rdp->nocb_gp_bypass], + ".G"[!!rdp->nocb_gp_gp], + (long)rdp->nocb_gp_seq, + rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), + rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', + rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread)); +} + +/* Dump out nocb kthread state for the specified rcu_data structure. 
*/ +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ + char bufd[22]; + char bufw[45]; + char bufr[45]; + char bufn[22]; + char bufb[22]; + struct rcu_data *nocb_next_rdp; + struct rcu_segcblist *rsclp = &rdp->cblist; + bool waslocked; + bool wassleep; + + if (rdp->nocb_gp_rdp == rdp) + show_rcu_nocb_gp_state(rdp); + + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return; + + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, + &rdp->nocb_entry_rdp, + typeof(*rdp), + nocb_entry_rdp); + + sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]); + sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL], + rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]); + sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass)); + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n", + rdp->cpu, rdp->nocb_gp_rdp->cpu, + nocb_next_rdp ? nocb_next_rdp->cpu : -1, + "kK"[!!rdp->nocb_cb_kthread], + "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], + "lL"[raw_spin_is_locked(&rdp->nocb_lock)], + "sS"[!!rdp->nocb_cb_sleep], + ".W"[swait_active(&rdp->nocb_cb_wq)], + jiffies - rdp->nocb_bypass_first, + jiffies - rdp->nocb_nobypass_last, + rdp->nocb_nobypass_count, + ".D"[rcu_segcblist_ready_cbs(rsclp)], + rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd, + ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, + ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, + ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn, + ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb, + rcu_segcblist_n_cbs(&rdp->cblist), + rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); + + /* It is OK for GP kthreads to have GP state. */ + if (rdp->nocb_gp_rdp == rdp) + return; + + waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); + wassleep = swait_active(&rdp->nocb_gp_wq); + if (!rdp->nocb_gp_sleep && !waslocked && !wassleep) + return; /* Nothing untoward. */ + + pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n", + "lL"[waslocked], + "dD"[!!rdp->nocb_defer_wakeup], + "sS"[!!rdp->nocb_gp_sleep], + ".W"[wassleep]); +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +/* No ->nocb_lock to acquire. */ +static void rcu_nocb_lock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock(struct rcu_data *rdp) +{ +} + +/* No ->nocb_lock to release. */ +static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, + unsigned long flags) +{ + local_irq_restore(flags); +} + +/* Lockdep check that ->cblist may be safely accessed. 
*/ +static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) +{ + lockdep_assert_irqs_disabled(); +} + +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) +{ +} + +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ +} + +static bool wake_nocb_gp(struct rcu_data *rdp, bool force) +{ + return false; +} + +static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, + unsigned long j, bool lazy) +{ + return true; +} + +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) +{ + WARN_ON_ONCE(1); /* Should be dead code! */ +} + +static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags) +{ + WARN_ON_ONCE(1); /* Should be dead code! */ +} + +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ +} + +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) +{ + return false; +} + +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + return false; +} + +static void rcu_spawn_cpu_nocb_kthread(int cpu) +{ +} + +static void show_rcu_nocb_state(struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1b3dd2fc0cd6..dbe2d02be824 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1,67 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions that provide either classic * or preemptible semantics. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ -#include <linux/delay.h> -#include <linux/gfp.h> -#include <linux/oom.h> -#include <linux/sched/debug.h> -#include <linux/smpboot.h> -#include <linux/sched/isolation.h> -#include <uapi/linux/sched/types.h> -#include "../time/tick-internal.h" - -#ifdef CONFIG_RCU_BOOST - #include "../locking/rtmutex_common.h" -/* - * Control variables for per-CPU and per-rcu_node kthreads. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, - * all uses are in dead code. Provide a definition to keep the compiler - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. - * This probably needs to be excluded from -rt builds. 
- */
-#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
-#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
-
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
+static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
+{
+ /*
+ * In order to read the offloaded state of an rdp in a safe
+ * and stable way and prevent its value from being changed
+ * under us, we must either hold the barrier mutex, the cpu
+ * hotplug lock (read or write) or the nocb lock. Local
+ * non-preemptible reads are also safe. NOCB kthreads and
+ * timers have their own means of synchronization against the
+ * offloaded state updaters.
+ */
+ RCU_NOCB_LOCKDEP_WARN(
+ !(lockdep_is_held(&rcu_state.barrier_mutex) ||
+ (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
+ lockdep_is_held(&rdp->nocb_lock) ||
+ lockdep_is_held(&rcu_state.nocb_mutex) ||
+ ((!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) || softirq_count()) &&
+ rdp == this_cpu_ptr(&rcu_data)) ||
+ rcu_current_is_nocb_kthread(rdp)),
+ "Unsafe read of RCU_NOCB offloaded state"
+ );
 
-#ifdef CONFIG_RCU_NOCB_CPU
-static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+ return rcu_segcblist_is_offloaded(&rdp->cblist);
+}
 
 /*
  * Check the RCU kernel configuration parameters and print informative
@@ -77,10 +52,10 @@ static void __init rcu_bootup_announce_oddness(void)
 RCU_FANOUT);
 if (rcu_fanout_exact)
 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
- if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
- pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
 if (IS_ENABLED(CONFIG_PROVE_RCU))
 pr_info("\tRCU lockdep checking is enabled.\n");
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
+ pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n");
 if (RCU_NUM_LVLS >= 4)
 pr_info("\tFour(or more)-level hierarchy is enabled.\n");
 if (RCU_FANOUT_LEAF != 16)
@@ -101,6 +76,8 @@ static void __init rcu_bootup_announce_oddness(void)
 pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
 if (qlowmark != DEFAULT_RCU_QLOMARK)
 pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
+ if (qovld != DEFAULT_RCU_QOVLD)
+ pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
 if (jiffies_till_first_fqs != ULONG_MAX)
 pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
 if (jiffies_till_next_fqs != ULONG_MAX)
@@ -110,13 +87,25 @@ static void __init rcu_bootup_announce_oddness(void)
 if (rcu_kick_kthreads)
 pr_info("\tKick kthreads if too-long grace period.\n");
 if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
- pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+ pr_info("\tRCU callback double-/use-after-free debug is enabled.\n");
 if (gp_preinit_delay)
 pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
 if (gp_init_delay)
 pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
 if (gp_cleanup_delay)
- pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (nohz_full_patience_delay < 0) {
+ pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay);
+ nohz_full_patience_delay = 0;
+ } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) {
+ pr_info("\tRCU NOCB CPU patience too large (%d), 
resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC); + nohz_full_patience_delay = 5 * MSEC_PER_SEC; + } else if (nohz_full_patience_delay) { + pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay); + } + nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay); + if (!use_softirq) + pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) pr_info("\tRCU debug extended QS entry/exit.\n"); rcupdate_announce_bootup_oddness(); @@ -194,9 +183,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) switch (blkd_state) { case 0: case RCU_EXP_TASKS: - case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD: case RCU_GP_TASKS: - case RCU_GP_TASKS + RCU_EXP_TASKS: + case RCU_GP_TASKS | RCU_EXP_TASKS: /* * Blocking neither GP, or first task blocking the normal @@ -209,10 +198,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) case RCU_EXP_BLKD: case RCU_GP_BLKD: - case RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: /* * First task arriving that blocks either GP, or first task @@ -225,9 +214,9 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); break; - case RCU_EXP_TASKS + RCU_EXP_BLKD: - case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_EXP_BLKD: + case RCU_EXP_TASKS | RCU_GP_BLKD | RCU_EXP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_EXP_BLKD: /* * Second or subsequent task blocking the expedited GP. @@ -238,8 +227,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) list_add(&t->rcu_node_entry, rnp->exp_tasks); break; - case RCU_GP_TASKS + RCU_GP_BLKD: - case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_GP_BLKD: + case RCU_GP_TASKS | RCU_EXP_TASKS | RCU_GP_BLKD: /* * Second or subsequent task blocking the normal GP. @@ -263,11 +252,11 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * blocked tasks. */ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { - rnp->gp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) - rnp->exp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask)); WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != @@ -279,11 +268,14 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * GP should not be able to end until we report, so there should be * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) + * + * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change. 
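+	 * The ASSERT_EXCLUSIVE_WRITER_SCOPED() below lets KCSAN verify
+	 * that nothing else writes it concurrently either.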
*/ - if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->deferred_qs); + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); } /* @@ -297,18 +289,22 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * current task, there might be any number of other tasks blocked while * in an RCU read-side critical section. * + * Unlike non-preemptible-RCU, quiescent state reports for expedited + * grace periods are handled separately via deferred quiescent states + * and context switch events. + * * Callers to this function must disable preemption. */ static void rcu_qs(void) { RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n"); - if (__this_cpu_read(rcu_data.cpu_no_qs.s)) { + if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */ - current->rcu_read_unlock_special.b.need_qs = false; + barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false); } } @@ -331,11 +327,10 @@ void rcu_note_context_switch(bool preempt) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp; - barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); - WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); - if (t->rcu_read_lock_nesting > 0 && + WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!"); + if (rcu_preempt_depth() > 0 && !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. */ @@ -349,7 +344,7 @@ void rcu_note_context_switch(bool preempt) * then queue the task as required based on the states * of any ongoing and expedited grace periods. */ - WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); + WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp)); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); trace_rcu_preempt_task(rcu_state.name, t->pid, @@ -357,15 +352,6 @@ void rcu_note_context_switch(bool preempt) ? rnp->gp_seq : rcu_seq_snap(&rnp->gp_seq)); rcu_preempt_ctxt_queue(rnp, rdp); - } else if (t->rcu_read_lock_nesting < 0 && - t->rcu_read_unlock_special.s) { - - /* - * Complete exit from RCU read-side critical section on - * behalf of preempted instance of __rcu_read_unlock(). - */ - rcu_read_unlock_special(t); - rcu_preempt_deferred_qs(t); } else { rcu_preempt_deferred_qs(t); } @@ -380,10 +366,10 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. */ rcu_qs(); - if (rdp->deferred_qs) + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -394,14 +380,30 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) { - return rnp->gp_tasks != NULL; + return READ_ONCE(rnp->gp_tasks) != NULL; } -/* Bias and limit values for ->rcu_read_lock_nesting. 
*/ -#define RCU_NEST_BIAS INT_MAX -#define RCU_NEST_NMAX (-INT_MAX / 2) +/* limit value for ->rcu_read_lock_nesting. */ #define RCU_NEST_PMAX (INT_MAX / 2) +static void rcu_preempt_read_enter(void) +{ + WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1); +} + +static int rcu_preempt_read_exit(void) +{ + int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1; + + WRITE_ONCE(current->rcu_read_lock_nesting, ret); + return ret; +} + +static void rcu_preempt_depth_set(int val) +{ + WRITE_ONCE(current->rcu_read_lock_nesting, val); +} + /* * Preemptible RCU implementation for rcu_read_lock(). * Just increment ->rcu_read_lock_nesting, shared state will be updated @@ -409,9 +411,11 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) */ void __rcu_read_lock(void) { - current->rcu_read_lock_nesting++; + rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) - WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX); + WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread) + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -427,21 +431,16 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; - } else { - barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = -RCU_NEST_BIAS; - barrier(); /* assign before ->rcu_read_unlock_special load */ + barrier(); // critical section before exit code. + if (rcu_preempt_read_exit() == 0) { + barrier(); // critical-section exit before .s check. if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; } if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { - int rrln = t->rcu_read_lock_nesting; + int rrln = rcu_preempt_depth(); - WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); + WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX); } } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -475,7 +474,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * be quite short, for example, in the case of the call from * rcu_read_unlock_special(). */ -static void +static notrace void rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) { bool empty_exp; @@ -487,23 +486,28 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) struct rcu_node *rnp; union rcu_special special; + rdp = this_cpu_ptr(&rcu_data); + if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING) + rdp->defer_qs_iw_pending = DEFER_QS_IDLE; + /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. 
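+	 * (The DEFER_QS_PENDING-to-DEFER_QS_IDLE transition above also
+	 * marks any outstanding deferred-QS irq_work request as serviced.)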
*/ special = t->rcu_read_unlock_special; - rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->deferred_qs) { + if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; } + t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { - rcu_qs(); - t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { - local_irq_restore(flags); - return; + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rdp->cpu_no_qs.b.norm = false; + rcu_report_qs_rdp(rdp); + udelay(rcu_unlock_delay); + } else { + rcu_qs(); } } @@ -513,17 +517,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->deferred_qs) { + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); - if (!t->rcu_read_unlock_special.s) { - local_irq_restore(flags); - return; - } - } /* Clean up if blocked during RCU read-side critical section. */ if (special.b.blocked) { - t->rcu_read_unlock_special.b.blocked = false; /* * Remove this task from the list it blocked on. The task @@ -538,22 +536,21 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); - empty_exp = sync_rcu_preempt_exp_done(rnp); - smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ + empty_exp = sync_rcu_exp_done(rnp); np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), rnp->gp_seq, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) - rnp->gp_tasks = np; + WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; + WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ - drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t; if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; + WRITE_ONCE(rnp->boost_tasks, np); } /* @@ -562,7 +559,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = sync_rcu_preempt_exp_done(rnp); + empty_exp_now = sync_rcu_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gp_seq, @@ -576,16 +573,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } - /* Unboost if we were boosted. */ - if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) - rt_mutex_futex_unlock(&rnp->boost_mtx); - /* * If this was the last task on the expedited lists, * then we need to report up the rcu_node hierarchy. */ if (!empty_exp && empty_exp_now) rcu_report_exp_rnp(rnp, true); + + /* Unboost if we were boosted. */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) + rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex); } else { local_irq_restore(flags); } @@ -600,11 +597,11 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * is disabled. This function cannot be expected to understand these * nuances, so the caller must handle them. 
 */
-static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
+static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 {
- return (__this_cpu_read(rcu_data.deferred_qs) ||
+ return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
 READ_ONCE(t->rcu_read_unlock_special.s)) &&
- t->rcu_read_lock_nesting <= 0;
+ rcu_preempt_depth() == 0;
 }
 
 /*
@@ -614,19 +611,110 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 * evaluate safety in terms of interrupt, softirq, and preemption
 * disabling.
 */
-static void rcu_preempt_deferred_qs(struct task_struct *t)
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
 {
 unsigned long flags;
- bool couldrecurse = t->rcu_read_lock_nesting >= 0;
 
 if (!rcu_preempt_need_deferred_qs(t))
 return;
- if (couldrecurse)
- t->rcu_read_lock_nesting -= RCU_NEST_BIAS;
 local_irq_save(flags);
 rcu_preempt_deferred_qs_irqrestore(t, flags);
- if (couldrecurse)
- t->rcu_read_lock_nesting += RCU_NEST_BIAS;
+}
+
+/*
+ * Minimal handler to give the scheduler a chance to re-evaluate.
+ */
+static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+
+ lockdep_assert_irqs_disabled();
+ rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
+
+ /*
+ * If the IRQ work handler happens to run in the middle of an RCU
+ * read-side critical section, it could be ineffective in getting the
+ * scheduler's attention to report a deferred quiescent state (the
+ * whole point of the IRQ work). For this reason, reset the state to
+ * DEFER_QS_IDLE so that the next rcu_read_unlock() can queue it again.
+ *
+ * Basically, we want to avoid the following situation:
+ * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
+ * 2. CPU enters new rcu_read_lock()
+ * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
+ * 4. rcu_read_unlock() does not re-queue work (state still PENDING)
+ * 5. Deferred QS reporting does not happen.
+ */
+ if (rcu_preempt_depth() > 0)
+ WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+}
+
+/*
+ * Check if expedited grace period processing during unlock is needed.
+ *
+ * This function determines whether expedited handling is required based on:
+ * 1. Task blocking an expedited grace period (based on a heuristic, could be
+ * a false positive, see below.)
+ * 2. CPU participating in an expedited grace period
+ * 3. Strict grace period mode requiring expedited handling
+ * 4. RCU priority deboosting needs when interrupts were disabled
+ *
+ * @t: The task being checked
+ * @rdp: The per-CPU RCU data
+ * @rnp: The RCU node for this CPU
+ * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock()
+ *
+ * Returns true if expedited processing of the rcu_read_unlock() is needed.
+ */
+static bool rcu_unlock_needs_exp_handling(struct task_struct *t,
+ struct rcu_data *rdp,
+ struct rcu_node *rnp,
+ bool irqs_were_disabled)
+{
+ /*
+ * Check if this task is blocking an expedited grace period. If the
+ * task was preempted within an RCU read-side critical section and is
+ * on the expedited grace period blockers list (exp_tasks), we need
+ * expedited handling to unblock the expedited GP. This is not an exact
+ * check because 't' might not be on the exp_tasks list at all - it's
+ * just a fast heuristic that can sometimes yield a false positive.
+ */
+ if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks))
+ return true;
+
+ /*
+ * Check if this CPU is participating in an expedited grace period.
+ * The expmask bitmap tracks which CPUs need to check in for the
+ * current expedited GP. 
If our CPU's bit is set, we need expedited + * handling to help complete the expedited GP. + */ + if (rdp->grpmask & READ_ONCE(rnp->expmask)) + return true; + + /* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods + * are treated as short for testing purposes even if that means + * disturbing the system more. Check if either: + * - This CPU has not yet reported a quiescent state, or + * - This task was preempted within an RCU critical section + * In either case, require expedited handling for strict GP mode. + */ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) + return true; + + /* + * RCU priority boosting case: If a task is subject to RCU priority + * boosting and exits an RCU read-side critical section with interrupts + * disabled, we need expedited handling to ensure timely deboosting. + * Without this, a low-priority task could incorrectly run at high + * real-time priority for an extended period degrading real-time + * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels, + * not just to PREEMPT_RT. + */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node) + return true; + + return false; } /* @@ -637,9 +725,9 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) static void rcu_read_unlock_special(struct task_struct *t) { unsigned long flags; + bool irqs_were_disabled; bool preempt_bh_were_disabled = !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); - bool irqs_were_disabled; /* NMI handlers cannot block and cannot safely manipulate state. */ if (in_nmi()) @@ -648,123 +736,44 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); - /* Need to defer quiescent state until everything is enabled. */ - if (irqs_were_disabled) { - /* Enabling irqs does not reschedule, so... */ + bool needs_exp; // Expedited handling needed. + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; + + needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled); + + // Need to defer quiescent state until everything is enabled. + if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) { + // Using softirq, safe to awaken, and either the + // wakeup is free or there is either an expedited + // GP in flight or a potential need to deboost. raise_softirq_irqoff(RCU_SOFTIRQ); } else { - /* Enabling BH or preempt does reschedule, so... */ - set_tsk_need_resched(current); - set_preempt_need_resched(); + // Enabling BH or preempt does reschedule, so... + // Also if no expediting and no possible deboosting, + // slow is OK. Plus nohz_full CPUs eventually get + // tick enabled. + set_need_resched_current(); + if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && + needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && + cpu_online(rdp->cpu)) { + // Get scheduler to re-evaluate and call hooks. + // If !IRQ_WORK, FQS scan will eventually IPI. + rdp->defer_qs_iw_pending = DEFER_QS_PENDING; + irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); + } } local_irq_restore(flags); return; } - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); rcu_preempt_deferred_qs_irqrestore(t, flags); } /* - * Dump detailed information for all tasks blocking the current RCU - * grace period on the specified rcu_node structure. 
- */ -static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) -{ - unsigned long flags; - struct task_struct *t; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - /* - * We could be printing a lot while holding a spinlock. - * Avoid triggering hard lockup. - */ - touch_nmi_watchdog(); - sched_show_task(t); - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. - */ -static void rcu_print_detail_task_stall(void) -{ - struct rcu_node *rnp = rcu_get_root(); - - rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rnp) - rcu_print_detail_task_stall_rnp(rnp); -} - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ - pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", - rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ - pr_cont("\n"); -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rcu_preempt_blocked_readers_cgp(rnp)) - return 0; - rcu_print_task_stall_begin(rnp); - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - rcu_print_task_stall_end(); - return ndetected; -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each that is blocking the current - * expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rnp->exp_tasks) - return 0; - t = list_entry(rnp->exp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - return ndetected; -} - -/* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock - * must be held by the caller. + * invoked -before- updating this rnp's ->gp_seq. * * Also, if there are blocked tasks on the list, they automatically * block the newly created grace period, so set up ->gp_tasks accordingly. 
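
(Aside: when non-NULL, ->gp_tasks and ->exp_tasks always point at some blocked task's rcu_node_entry within ->blkd_tasks. A hypothetical checker for that invariant, illustrative only and not part of this patch, assuming the caller holds the rcu_node's ->lock as dump_blkd_tasks() does, might look like this:)

static bool rnp_task_ptrs_sane(struct rcu_node *rnp)
{
        struct list_head *lhp;
        bool gp_ok = !READ_ONCE(rnp->gp_tasks);
        bool exp_ok = !READ_ONCE(rnp->exp_tasks);

        /* Each non-NULL pointer must match some entry in ->blkd_tasks. */
        list_for_each(lhp, &rnp->blkd_tasks) {
                if (lhp == READ_ONCE(rnp->gp_tasks))
                        gp_ok = true;
                if (lhp == READ_ONCE(rnp->exp_tasks))
                        exp_ok = true;
        }
        return gp_ok && exp_ok;
}
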
@@ -774,11 +783,12 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) struct task_struct *t; RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); + raw_lockdep_assert_held_rcu_node(rnp); if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) dump_blkd_tasks(rnp, 10); if (rcu_preempt_has_tasks(rnp) && (rnp->qsmaskinit || rnp->wait_blkd_tasks)) { - rnp->gp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next); t = container_of(rnp->gp_tasks, struct task_struct, rcu_node_entry); trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), @@ -788,36 +798,32 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) } /* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the corresponding CPU's rcu_node structure, - * which is checked elsewhere. - * - * Caller must disable hard irqs. + * Check for a quiescent state from the current CPU, including voluntary + * context switches for Tasks RCU. When a task blocks, the task is + * recorded in the corresponding CPU's rcu_node structure, which is checked + * elsewhere, hence this function need only check for quiescent states + * related to the current CPU, not to those related to tasks. */ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user) { struct task_struct *t = current; - if (user || rcu_is_cpu_rrupt_from_idle()) { - rcu_note_voluntary_context_switch(current); - } - if (t->rcu_read_lock_nesting > 0 || + lockdep_assert_irqs_disabled(); + if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ - if (rcu_preempt_need_deferred_qs(t)) { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + if (rcu_preempt_need_deferred_qs(t)) + set_need_resched_current(); } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; - } else if (!t->rcu_read_lock_nesting) { + } else if (!WARN_ON_ONCE(rcu_preempt_depth())) { rcu_qs(); /* Report immediate QS. */ return; } /* If GP is oldish, ask for help from rcu_read_unlock_special(). */ - if (t->rcu_read_lock_nesting > 0 && + if (rcu_preempt_depth() > 0 && __this_cpu_read(rcu_data.core_needs_qs) && __this_cpu_read(rcu_data.cpu_no_qs.b.norm) && !t->rcu_read_unlock_special.b.need_qs && @@ -825,69 +831,27 @@ static void rcu_flavor_check_callbacks(int user) t->rcu_read_unlock_special.b.need_qs = true; } -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. Note, however, that - * upon return from synchronize_rcu(), the caller might well be executing - * concurrently with new RCU read-side critical sections that began while - * synchronize_rcu() was waiting. RCU read-side critical sections are - * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. - * In addition, regions of code across which interrupts, preemption, or - * softirqs have been disabled also serve as RCU read-side critical - * sections. This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that this guarantee implies further memory-ordering guarantees. 
- * On systems with more than one CPU, when synchronize_rcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last RCU read-side critical section whose beginning - * preceded the call to synchronize_rcu(). In addition, each CPU having - * an RCU read-side critical section that extends beyond the return from - * synchronize_rcu() is guaranteed to have executed a full memory barrier - * after the beginning of synchronize_rcu() and before the beginning of - * that RCU read-side critical section. Note that these guarantees include - * CPUs that are offline, idle, or executing in user mode, as well as CPUs - * that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - */ -void synchronize_rcu(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - /* * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. + * critical section, clean up if so. No need to issue warnings, as + * debug_check_no_locks_held() already does this if lockdep is enabled. + * Besides, if this function does anything other than just immediately + * return, there was a bug of some sort. Spewing warnings from this + * function is like as not to simply obscure important prior warnings. 
*/ void exit_rcu(void) { struct task_struct *t = current; - if (likely(list_empty(¤t->rcu_node_entry))) + if (unlikely(!list_empty(¤t->rcu_node_entry))) { + rcu_preempt_depth_set(1); + barrier(); + WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); + } else if (unlikely(rcu_preempt_depth())) { + rcu_preempt_depth_set(1); + } else { return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special.b.blocked = true; + } __rcu_read_unlock(); rcu_preempt_deferred_qs(current); } @@ -902,40 +866,70 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) int cpu; int i; struct list_head *lhp; - bool onl; struct rcu_data *rdp; struct rcu_node *rnp1; raw_lockdep_assert_held_rcu_node(rnp); pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, - (long)rnp->gp_seq, (long)rnp->completedqs); + (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs); for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), + READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { pr_cont(" %p", lhp); - if (++i >= 10) + if (++i >= ncheck) break; } pr_cont("\n"); for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { rdp = per_cpu_ptr(&rcu_data, cpu); - onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", - cpu, ".o"[onl], - (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, - (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); + cpu, ".o"[rcu_rdp_cpu_online(rdp)], + (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state, + (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state); } } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) +{ + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); +} #else /* #ifdef CONFIG_PREEMPT_RCU */ /* + * If strict grace periods are enabled, and if the calling + * __rcu_read_unlock() marks the beginning of a quiescent state, immediately + * report that quiescent state and, if requested, spin for a bit. + */ +void rcu_read_unlock_strict(void) +{ + struct rcu_data *rdp; + + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) + return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ + rdp = this_cpu_ptr(&rcu_data); + rdp->cpu_no_qs.b.norm = false; + rcu_report_qs_rdp(rdp); + udelay(rcu_unlock_delay); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); + +/* * Tell them what RCU they are running. */ static void __init rcu_bootup_announce(void) @@ -945,7 +939,7 @@ static void __init rcu_bootup_announce(void) } /* - * Note a quiescent state for PREEMPT=n. Because we do not need to know + * Note a quiescent state for PREEMPTION=n. Because we do not need to know * how many quiescent states passed, just if there was at least one since * the start of the grace period, this just sets a flag. The caller must * have disabled preemption. 
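
(Aside: the one-bit accounting described above reduces to a very small pattern. A freestanding sketch, illustrative only; the real flag is rcu_data.cpu_no_qs.b.norm, and the real rcu_qs() additionally traces the transition and reports expedited quiescent states:)

/* One bit per CPU: set at grace-period start, cleared by the first QS. */
static DEFINE_PER_CPU(bool, demo_cpu_no_qs);

/* Caller must have preemption disabled, as with rcu_qs() itself. */
static void demo_note_qs(void)
{
        if (__this_cpu_read(demo_cpu_no_qs))
                __this_cpu_write(demo_cpu_no_qs, false);
}
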
@@ -958,23 +952,16 @@ static void rcu_qs(void) trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) - return; - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); } /* * Register an urgently needed quiescent state. If there is an - * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * emergency, invoke rcu_momentary_eqs() to do a heavy-weight * dyntick-idle quiescent state visible to other CPUs, which will in * some cases serve for expedited as well as normal grace periods. * Either way, register a lightweight quiescent state. - * - * The barrier() calls are redundant in the common case when this is - * called externally, but just in case this is called from within this - * file. - * */ void rcu_all_qs(void) { @@ -982,31 +969,28 @@ void rcu_all_qs(void) if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) return; - preempt_disable(); + preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels /* Load rcu_urgent_qs before other flags. */ if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { preempt_enable(); return; } this_cpu_write(rcu_data.rcu_urgent_qs, false); - barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } rcu_qs(); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); /* - * Note a PREEMPT=n context switch. The caller must have disabled interrupts. + * Note a PREEMPTION=n context switch. The caller must have disabled interrupts. */ void rcu_note_context_switch(bool preempt) { - barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_qs(); /* Load rcu_urgent_qs before other flags. */ @@ -1014,12 +998,10 @@ void rcu_note_context_switch(bool preempt) goto out; this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) - rcu_momentary_dyntick_idle(); - if (!preempt) - rcu_tasks_qs(current); + rcu_momentary_eqs(); out: + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); - barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -1044,37 +1026,24 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * Because there is no preemptible RCU, there can be no deferred quiescent * states. */ -static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return false; } -static void rcu_preempt_deferred_qs(struct task_struct *t) { } -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static void rcu_print_detail_task_stall(void) +// Except that we do need to respond to a request by an expedited +// grace period for a quiescent state from this CPU. Note that in +// non-preemptible kernels, there can be no context switches within RCU +// read-side critical sections, which in turn means that the leaf rcu_node +// structure's blocked-tasks list is always empty. 
There is therefore no need to
+// actually check it. Instead, a quiescent state from this CPU suffices,
+// and this function is only called from such a quiescent state.
+notrace void rcu_preempt_deferred_qs(struct task_struct *t)
 {
-}
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
-{
- return 0;
-}
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections that are
- * blocking the current expedited grace period.
- */
-static int rcu_print_task_exp_stall(struct rcu_node *rnp)
-{
- return 0;
+ if (READ_ONCE(rdp->cpu_no_qs.b.exp))
+ rcu_report_exp_rdp(rdp);
 }
 
 /*
@@ -1088,49 +1057,31 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 }
 
 /*
- * Check to see if this CPU is in a non-context-switch quiescent state
- * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule RCU core processing.
- *
- * This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt.
+ * Check to see if this CPU is in a non-context-switch quiescent state,
+ * namely user mode and idle loop.
 */
-static void rcu_flavor_check_callbacks(int user)
+static void rcu_flavor_sched_clock_irq(int user)
 {
- if (user || rcu_is_cpu_rrupt_from_idle()) {
+ if (user || rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) &&
+ (preempt_count() == HARDIRQ_OFFSET))) {
 /*
 * Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
+ * mode, from the idle loop without this being a nested
+ * interrupt, or while not holding the task preempt count
+ * (with PREEMPT_COUNT=y). In this case, the CPU is in a
+ * quiescent state, so note it.
 *
 * No memory barrier is required here because rcu_qs()
 * references only CPU-local variables that other CPUs
 * neither access nor modify, at least not while the
 * corresponding CPU is online.
 */
-
 rcu_qs();
 }
 }
 
-/* PREEMPT=n implementation of synchronize_rcu(). */
-void synchronize_rcu(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp())
- return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
 /*
 * Because preemptible RCU does not exist, tasks cannot possibly exit
 * while in preemptible RCU read-side critical sections.
@@ -1148,21 +1099,48 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
 WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
 }
 
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
+
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
+/*
+ * If boosting, set rcuc kthreads to realtime priority. 
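+ * Either way, stamp ->rcuc_activity so the stall-warning code can
+ * see that this CPU's rcuc kthread ran recently.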
+ */ +static void rcu_cpu_kthread_setup(unsigned int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); #ifdef CONFIG_RCU_BOOST + struct sched_param sp; + + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); +#endif /* #ifdef CONFIG_RCU_BOOST */ + + WRITE_ONCE(rdp->rcuc_activity, jiffies); +} -static void rcu_wake_cond(struct task_struct *t, int status) +static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp) { - /* - * If the thread is yielding, only wake it when this - * is invoked from idle - */ - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) - wake_up_process(t); +#ifdef CONFIG_RCU_NOCB_CPU + return rdp->nocb_cb_kthread == current; +#else + return false; +#endif } /* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp) +{ + return rdp->rcu_cpu_kthread_task == current || + rcu_is_callbacks_nocb_kthread(rdp); +} + +#ifdef CONFIG_RCU_BOOST + +/* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the * ->blkd_tasks list. @@ -1219,11 +1197,12 @@ static int rcu_boost(struct rcu_node *rnp) * section. */ t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); + rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ + rnp->n_boosts++; return READ_ONCE(rnp->exp_tasks) != NULL || READ_ONCE(rnp->boost_tasks) != NULL; @@ -1240,20 +1219,21 @@ static int rcu_boost_kthread(void *arg) trace_rcu_utilization(TPS("Start boost kthread@init")); for (;;) { - rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rcu_wait(READ_ONCE(rnp->boost_tasks) || + READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); - rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); if (more2boost) spincnt++; else spincnt = 0; if (spincnt > 10) { - rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; + WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); - schedule_timeout_interruptible(2); + schedule_timeout_idle(2); trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } @@ -1276,10 +1256,9 @@ static int rcu_boost_kthread(void *arg) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - struct task_struct *t; - raw_lockdep_assert_held_rcu_node(rnp); - if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + if (!rnp->boost_kthread_task || + (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1287,44 +1266,18 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - ULONG_CMP_GE(jiffies, rnp->boost_time))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld || + IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) { if 
(rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; + WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - t = rnp->boost_kthread_task; - if (t) - rcu_wake_cond(t, rnp->boost_kthread_status); + rcu_wake_cond(rnp->boost_kthread_task, + READ_ONCE(rnp->boost_kthread_status)); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } -/* - * Wake up the per-CPU kthread to invoke RCU callbacks. - */ -static void invoke_rcu_callbacks_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), - __this_cpu_read(rcu_cpu_kthread_status)); - } - local_irq_restore(flags); -} - -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return __this_cpu_read(rcu_cpu_kthread_task) == current; -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1338,159 +1291,30 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) /* * Create an RCU-boost kthread for the specified node if one does not * already exist. We only create this kthread for preemptible RCU. - * Returns zero if all is well, a negated errno otherwise. */ -static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) +static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { - int rnp_index = rnp - rcu_get_root(); unsigned long flags; + int rnp_index = rnp - rcu_get_root(); struct sched_param sp; struct task_struct *t; - if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) - return 0; - - if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) - return 0; + if (rnp->boost_kthread_task) + return; - rcu_state.boost = 1; - if (rnp->boost_kthread_task != NULL) - return 0; t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); + if (WARN_ON_ONCE(IS_ERR(t))) + return; + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + rcu_thread_affine_rnp(t, rnp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - return 0; -} - -static void rcu_kthread_do_work(void) -{ - rcu_do_batch(this_cpu_ptr(&rcu_data)); -} - -static void rcu_cpu_kthread_setup(unsigned int cpu) -{ - struct sched_param sp; - - sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -} - -static void rcu_cpu_kthread_park(unsigned int cpu) -{ - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; -} - -static int rcu_cpu_kthread_should_run(unsigned int cpu) -{ - return __this_cpu_read(rcu_cpu_has_work); -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces - * the RCU softirq used in configurations of RCU that do not support RCU - * priority boosting. 
- */ -static void rcu_cpu_kthread(unsigned int cpu) -{ - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); - int spincnt; - - for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); - local_bh_disable(); - *statusp = RCU_KTHREAD_RUNNING; - this_cpu_inc(rcu_cpu_kthread_loops); - local_irq_disable(); - work = *workp; - *workp = 0; - local_irq_enable(); - if (work) - rcu_kthread_do_work(); - local_bh_enable(); - if (*workp == 0) { - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); - *statusp = RCU_KTHREAD_WAITING; - return; - } - } - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); - *statusp = RCU_KTHREAD_WAITING; -} - -/* - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ - struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask = rcu_rnp_online_cpus(rnp); - cpumask_var_t cm; - int cpu; - - if (!t) - return; - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - for_each_leaf_node_possible_cpu(rnp, cpu) - if ((mask & leaf_node_cpu_bit(rnp, cpu)) && - cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) - cpumask_setall(cm); - set_cpus_allowed_ptr(t, cm); - free_cpumask_var(cm); -} - -static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_cpu_kthread_task, - .thread_should_run = rcu_cpu_kthread_should_run, - .thread_fn = rcu_cpu_kthread, - .thread_comm = "rcuc/%u", - .setup = rcu_cpu_kthread_setup, - .park = rcu_cpu_kthread_park, -}; - -/* - * Spawn boost kthreads -- called as soon as the scheduler is running. - */ -static void __init rcu_spawn_boost_kthreads(void) -{ - struct rcu_node *rnp; - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(rcu_cpu_has_work, cpu) = 0; - if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) - return; - rcu_for_each_leaf_node(rnp) - (void)rcu_spawn_one_boost_kthread(rnp); -} - -static void rcu_prepare_kthreads(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; - - /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. 
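rcu_boost_kthread_setaffinity(), deleted above, binds the boost kthread to the rcu_node's CPUs minus the outgoing one, and falls back to an unrestricted mask when nothing is left. A rough userspace rendering of the same dance (Linux-specific pthread affinity; the 4-CPU node span and the outgoing CPU are invented for the example):

```c
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

/* Affine the current thread to CPUs [lo, hi] except 'outgoing';
 * if that empties the mask, leave affinity unrestricted, as the
 * deleted rcu_boost_kthread_setaffinity() does via cpumask_setall(). */
static void affine_excluding(int lo, int hi, int outgoing)
{
	cpu_set_t cm;
	int cpu, weight = 0;

	CPU_ZERO(&cm);
	for (cpu = lo; cpu <= hi; cpu++)
		if (cpu != outgoing) {
			CPU_SET(cpu, &cm);
			weight++;
		}
	if (!weight)
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
			CPU_SET(cpu, &cm);
	pthread_setaffinity_np(pthread_self(), sizeof(cm), &cm);
}

int main(void)
{
	affine_excluding(0, 3, 2);	/* hypothetical leaf node; CPU 2 going offline */
	puts("affinity set");
	return 0;
}
```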
*/ - if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rnp); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1501,1195 +1325,16 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static void invoke_rcu_callbacks_kthread(void) -{ - WARN_ON_ONCE(1); -} - -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ -} - -static void __init rcu_spawn_boost_kthreads(void) -{ -} - -static void rcu_prepare_kthreads(int cpu) +static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { } #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#if !defined(CONFIG_RCU_FAST_NO_HZ) - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - * - * Because we not have RCU_FAST_NO_HZ, just check whether or not this - * CPU has RCU callbacks queued. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - *nextevt = KTIME_MAX; - return rcu_cpu_has_callbacks(NULL); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up - * after it. - */ -static void rcu_cleanup_after_idle(void) -{ -} - -/* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, - * is nothing. - */ -static void rcu_prepare_for_idle(void) -{ -} - -/* - * Don't bother keeping a running count of the number of RCU callbacks - * posted because CONFIG_RCU_FAST_NO_HZ=n. - */ -static void rcu_idle_count_callbacks_posted(void) -{ -} - -#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - -/* - * This code is invoked when a CPU goes idle, at which point we want - * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. This is handled by a - * state machine implemented by rcu_prepare_for_idle() below. - * - * The following three proprocessor symbols control this state machine: - * - * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted - * to sleep in dyntick-idle mode with RCU callbacks pending. This - * is sized to be roughly one RCU grace period. Those energy-efficiency - * benchmarkers who might otherwise be tempted to set this to a large - * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your - * system. And if you are -that- concerned about energy efficiency, - * just power the system down and be done with it! - * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is - * permitted to sleep in dyntick-idle mode with only lazy RCU - * callbacks pending. Setting this too high can OOM your system. - * - * The values below work well in practice. If future workloads require - * adjustment, they can be converted into kernel config parameters, though - * making the state machine smarter might be a better option. - */ -#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ -#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ - -static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; -module_param(rcu_idle_gp_delay, int, 0644); -static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; -module_param(rcu_idle_lazy_gp_delay, int, 0644); - -/* - * Try to advance callbacks on the current CPU, but only if it has been - * awhile since the last time we did so. 
Afterwards, if there are any - * callbacks ready for immediate invocation, return true. - */ -static bool __maybe_unused rcu_try_advance_all_cbs(void) -{ - bool cbs_ready = false; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - - /* Exit early if we advanced recently. */ - if (jiffies == rdp->last_advance_all) - return false; - rdp->last_advance_all = jiffies; - - rnp = rdp->mynode; - - /* - * Don't bother checking unless a grace period has - * completed since we last checked and there are - * callbacks not yet ready to invoke. - */ - if ((rcu_seq_completed_gp(rdp->gp_seq, - rcu_seq_current(&rnp->gp_seq)) || - unlikely(READ_ONCE(rdp->gpwrap))) && - rcu_segcblist_pend_cbs(&rdp->cblist)) - note_gp_changes(rdp); - - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - cbs_ready = true; - return cbs_ready; -} - -/* - * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready - * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller to set the timeout based on whether or not there are non-lazy - * callbacks. - * - * The caller must have disabled interrupts. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - unsigned long dj; - - lockdep_assert_irqs_disabled(); - - /* Snapshot to detect later posting of non-lazy callback. */ - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; - - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) { - *nextevt = KTIME_MAX; - return 0; - } - - /* Attempt to advance callbacks. */ - if (rcu_try_advance_all_cbs()) { - /* Some ready to invoke, so initiate later invocation. */ - invoke_rcu_core(); - return 1; - } - rdp->last_accelerate = jiffies; - - /* Request timer delay depending on laziness, and round. */ - if (!rdp->all_lazy) { - dj = round_up(rcu_idle_gp_delay + jiffies, - rcu_idle_gp_delay) - jiffies; - } else { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; - } - *nextevt = basemono + dj * TICK_NSEC; - return 0; -} - -/* - * Prepare a CPU for idle from an RCU perspective. The first major task - * is to sense whether nohz mode has been enabled or disabled via sysfs. - * The second major task is to check to see if a non-lazy callback has - * arrived at a CPU that previously had only lazy callbacks. The third - * major task is to accelerate (that is, assign grace-period numbers to) - * any recently arrived callbacks. - * - * The caller must have disabled interrupts. - */ -static void rcu_prepare_for_idle(void) -{ - bool needwake; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - int tne; - - lockdep_assert_irqs_disabled(); - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - - /* Handle nohz enablement switches conservatively. */ - tne = READ_ONCE(tick_nohz_active); - if (tne != rdp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(NULL)) - invoke_rcu_core(); /* force nohz to see update. */ - rdp->tick_nohz_enabled_snap = tne; - return; - } - if (!tne) - return; - - /* - * If a non-lazy callback arrived at a CPU having only lazy - * callbacks, invoke RCU core for the side-effect of recalculating - * idle duration on re-entry to idle. - */ - if (rdp->all_lazy && - rdp->nonlazy_posted != rdp->nonlazy_posted_snap) { - rdp->all_lazy = false; - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; - invoke_rcu_core(); - return; - } - - /* - * If we have not yet accelerated this jiffy, accelerate all - * callbacks on this CPU. 
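The deleted rcu_needs_cpu() batches idle wakeups by rounding the wakeup target up to a rcu_idle_gp_delay boundary, so idle CPUs with non-lazy callbacks wake together: dj = round_up(delay + jiffies, delay) - jiffies. Here is that rounding in isolation, using the kernel's power-of-two round_up() formula and an arbitrary sample jiffies value:

```c
#include <stdio.h>

/* Kernel-style round_up() for power-of-two 'y'. */
#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	unsigned long jiffies = 10007;	/* arbitrary sample value */
	unsigned long delay = 4;	/* RCU_IDLE_GP_DELAY */
	unsigned long dj = round_up(jiffies + delay, delay) - jiffies;

	/* Wakeup lands on the next multiple of 4 at least 'delay' away. */
	printf("sleep %lu jiffies, wake at %lu\n", dj, jiffies + dj);
	return 0;	/* prints: sleep 5 jiffies, wake at 10012 */
}
```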
- */ - if (rdp->last_accelerate == jiffies) - return; - rdp->last_accelerate = jiffies; - if (rcu_segcblist_pend_cbs(&rdp->cblist)) { - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_accelerate_cbs(rnp, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - if (needwake) - rcu_gp_kthread_wake(); - } -} - -/* - * Clean up for exit from idle. Attempt to advance callbacks based on - * any grace periods that elapsed while the CPU was idle, and if any - * callbacks are now ready to invoke, initiate invocation. - */ -static void rcu_cleanup_after_idle(void) -{ - lockdep_assert_irqs_disabled(); - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - if (rcu_try_advance_all_cbs()) - invoke_rcu_core(); -} - -/* - * Keep a running count of the number of non-lazy callbacks posted - * on this CPU. This running counter (which is never decremented) allows - * rcu_prepare_for_idle() to detect when something out of the idle loop - * posts a callback, even if an equal number of callbacks are invoked. - * Of course, callbacks should only be posted from within a trace event - * designed to be called from idle or from within RCU_NONIDLE(). - */ -static void rcu_idle_count_callbacks_posted(void) -{ - __this_cpu_add(rcu_data.nonlazy_posted, 1); -} - -#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap; - - sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", - rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ulong2long(nlpd), - rdp->all_lazy ? 'L' : '.', - rdp->tick_nohz_enabled_snap ? '.' : 'D'); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - -/* Initiate the stall-info list. */ -static void print_cpu_stall_info_begin(void) -{ - pr_cont("\n"); -} - -/* - * Print out diagnostic information for the specified stalled CPU. - * - * If the specified CPU is aware of the current RCU grace period, then - * print the number of scheduling clock interrupts the CPU has taken - * during the time that it has been aware. Otherwise, print the number - * of RCU grace periods that this CPU is ignorant of, for example, "1" - * if the CPU was aware of the previous grace period. - * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. - */ -static void print_cpu_stall_info(int cpu) -{ - unsigned long delta; - char fast_no_hz[72]; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - char *ticks_title; - unsigned long ticks_value; - - /* - * We could be printing a lot while holding a spinlock. Avoid - * triggering hard lockup. - */ - touch_nmi_watchdog(); - - ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); - if (ticks_value) { - ticks_title = "GPs behind"; - } else { - ticks_title = "ticks this GP"; - ticks_value = rdp->ticks_this_gp; - } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", - cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], - "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], - !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : - rdp->rcu_iw_pending ? 
(int)min(delta, 9UL) + '0' : - "!."[!delta], - ticks_value, ticks_title, - rcu_dynticks_snap(rdp) & 0xfff, - rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, - rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); -} - -/* Terminate the stall-info list. */ -static void print_cpu_stall_info_end(void) -{ - pr_err("\t"); -} - -/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ - rdp->ticks_this_gp = 0; - rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); - WRITE_ONCE(rdp->last_fqs_resched, jiffies); -} - -#ifdef CONFIG_RCU_NOCB_CPU - -/* - * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask. For each CPU in the set, there is a - * kthread created that pulls the callbacks from the corresponding CPU, - * waits for a grace period to elapse, and invokes the callbacks. - * The no-CBs CPUs do a wake_up() on their kthread when they insert - * a callback into any empty list, unless the rcu_nocb_poll boot parameter - * has been specified, in which case each kthread actively polls its - * CPU. (Which isn't so great for energy efficiency, but which does - * reduce RCU's overhead on that CPU.) - * - * This is intended to be used in conjunction with Frederic Weisbecker's - * adaptive-idle work, which would seriously reduce OS jitter on CPUs - * running CPU-bound user-mode computations. - * - * Offloading of callback processing could also in theory be used as - * an energy-efficiency measure because CPUs with no RCU callbacks - * queued are more aggressive about entering dyntick-idle mode. - */ - - -/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ -static int __init rcu_nocb_setup(char *str) -{ - alloc_bootmem_cpumask_var(&rcu_nocb_mask); - cpulist_parse(str, rcu_nocb_mask); - return 1; -} -__setup("rcu_nocbs=", rcu_nocb_setup); - -static int __init parse_rcu_nocb_poll(char *arg) -{ - rcu_nocb_poll = true; - return 0; -} -early_param("rcu_nocb_poll", parse_rcu_nocb_poll); - -/* - * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended - * grace period. - */ -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) -{ - swake_up_all(sq); -} - -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -{ - return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ - init_swait_queue_head(&rnp->nocb_gp_wq[0]); - init_swait_queue_head(&rnp->nocb_gp_wq[1]); -} - -/* Is the specified CPU a no-CBs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (cpumask_available(rcu_nocb_mask)) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - -/* - * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock - * and this function releases it. - */ -static void __wake_nocb_leader(struct rcu_data *rdp, bool force, - unsigned long flags) - __releases(rdp->nocb_lock) -{ - struct rcu_data *rdp_leader = rdp->nocb_leader; - - lockdep_assert_held(&rdp->nocb_lock); - if (!READ_ONCE(rdp_leader->nocb_kthread)) { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - return; - } - if (rdp_leader->nocb_leader_sleep || force) { - /* Prior smp_mb__after_atomic() orders against prior enqueue. 
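rcu_nocb_setup(), earlier in this hunk, hands the rcu_nocbs= boot string to cpulist_parse(), which accepts ranges such as "1-3,8". A self-contained sketch of that list syntax follows; it is simplified (no input validation, a plain unsigned long instead of a cpumask_var_t):

```c
#include <stdio.h>
#include <stdlib.h>

/* Parse "1-3,8"-style CPU lists into a bitmask, loosely modelling
 * what cpulist_parse() does for the rcu_nocbs= boot parameter. */
static unsigned long cpulist_parse_simple(const char *s)
{
	unsigned long mask = 0;

	while (*s) {
		char *end;
		long lo = strtol(s, &end, 10), hi = lo;

		if (*end == '-')
			hi = strtol(end + 1, &end, 10);
		for (long cpu = lo; cpu <= hi; cpu++)
			mask |= 1UL << cpu;
		s = (*end == ',') ? end + 1 : end;
	}
	return mask;
}

int main(void)
{
	/* prints 0x10e: bits 1, 2, 3, and 8 */
	printf("rcu_nocbs=1-3,8 -> mask %#lx\n", cpulist_parse_simple("1-3,8"));
	return 0;
}
```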
*/ - WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); - del_timer(&rdp->nocb_timer); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ - swake_up_one(&rdp_leader->nocb_wq); - } else { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - } -} - -/* - * Kick the leader kthread for this NOCB group, but caller has not - * acquired locks. - */ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - __wake_nocb_leader(rdp, force, flags); -} - -/* - * Arrange to wake the leader kthread for this NOCB group at some - * future time when it is safe to do so. - */ -static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, - const char *reason) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) - mod_timer(&rdp->nocb_timer, jiffies + 1); - WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); -} - -/* - * Does the specified CPU need an RCU callback for this invocation - * of rcu_barrier()? - */ -static bool rcu_nocb_cpu_needs_barrier(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - unsigned long ret; -#ifdef CONFIG_PROVE_RCU - struct rcu_head *rhp; -#endif /* #ifdef CONFIG_PROVE_RCU */ - - /* - * Check count of all no-CBs callbacks awaiting invocation. - * There needs to be a barrier before this function is called, - * but associated with a prior determination that no more - * callbacks would be posted. In the worst case, the first - * barrier in rcu_barrier() suffices (but the caller cannot - * necessarily rely on this, not a substitute for the caller - * getting the concurrency design right!). There must also be - * a barrier between the following load an posting of a callback - * (if a callback is in fact needed). This is associated with an - * atomic_inc() in the caller. - */ - ret = rcu_get_n_cbs_nocb_cpu(rdp); - -#ifdef CONFIG_PROVE_RCU - rhp = READ_ONCE(rdp->nocb_head); - if (!rhp) - rhp = READ_ONCE(rdp->nocb_gp_head); - if (!rhp) - rhp = READ_ONCE(rdp->nocb_follower_head); - - /* Having no rcuo kthread but CBs after scheduler starts is bad! */ - if (!READ_ONCE(rdp->nocb_kthread) && rhp && - rcu_scheduler_fully_active) { - /* RCU callback enqueued before CPU first came online??? */ - pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", - cpu, rhp->func); - WARN_ON_ONCE(1); - } -#endif /* #ifdef CONFIG_PROVE_RCU */ - - return !!ret; -} - -/* - * Enqueue the specified string of rcu_head structures onto the specified - * CPU's no-CBs lists. The CPU is specified by rdp, the head of the - * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy - * counts are supplied by rhcount and rhcount_lazy. - * - * If warranted, also wake up the kthread servicing this CPUs queues. - */ -static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, - struct rcu_head *rhp, - struct rcu_head **rhtp, - int rhcount, int rhcount_lazy, - unsigned long flags) -{ - int len; - struct rcu_head **old_rhpp; - struct task_struct *t; - - /* Enqueue the callback on the nocb list and update counts. */ - atomic_long_add(rhcount, &rdp->nocb_q_count); - /* rcu_barrier() relies on ->nocb_q_count add before xchg. 
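The enqueue step just below swings ->nocb_tail with xchg() and only then publishes the new element through the old tail slot, the classic multi-producer tail-exchange queue. A minimal C11 rendering of that two-step (the types and the single-threaded demo are illustrative):

```c
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct cb {
	struct cb *_Atomic next;
	int id;
};

static struct cb *_Atomic head;
static _Atomic(struct cb *_Atomic *) tail = &head;

/* Swing the tail first, then publish through the old tail slot,
 * mirroring the xchg()/WRITE_ONCE() pair in __call_rcu_nocb_enqueue().
 * Multiple producers may race on the exchange; each still gets a
 * private old-tail slot to write through. */
static void enqueue(struct cb *cbp)
{
	struct cb *_Atomic *old_tail;

	atomic_store_explicit(&cbp->next, NULL, memory_order_relaxed);
	old_tail = atomic_exchange(&tail, &cbp->next);
	atomic_store_explicit(old_tail, cbp, memory_order_release);
}

int main(void)
{
	struct cb a = { .id = 1 }, b = { .id = 2 };

	enqueue(&a);
	enqueue(&b);
	for (struct cb *p = atomic_load(&head); p; p = atomic_load(&p->next))
		printf("cb %d\n", p->id);
	return 0;
}
```

A consumer racing with the second store can observe a momentarily broken chain, which is why the rcu_nocb_kthread() later in this hunk spins with "while (next == NULL && &list->next != tail)" before invoking a callback.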
*/ - old_rhpp = xchg(&rdp->nocb_tail, rhtp); - WRITE_ONCE(*old_rhpp, rhp); - atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); - smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ - - /* If we are not being polled and there is a kthread, awaken it ... */ - t = READ_ONCE(rdp->nocb_kthread); - if (rcu_nocb_poll || !t) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeNotPoll")); - return; - } - len = rcu_get_n_cbs_nocb_cpu(rdp); - if (old_rhpp == &rdp->nocb_head) { - if (!irqs_disabled_flags(flags)) { - /* ... if queue was empty ... */ - wake_nocb_leader(rdp, false); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeEmpty")); - } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, - TPS("WakeEmptyIsDeferred")); - } - rdp->qlen_last_fqs_check = 0; - } else if (len > rdp->qlen_last_fqs_check + qhimark) { - /* ... or if many callbacks queued. */ - if (!irqs_disabled_flags(flags)) { - wake_nocb_leader(rdp, true); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WakeOvf")); - } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, - TPS("WakeOvfIsDeferred")); - } - rdp->qlen_last_fqs_check = LONG_MAX / 2; - } else { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); - } - return; -} - -/* - * This is a helper for __call_rcu(), which invokes this when the normal - * callback queue is inoperable. If this is not a no-CBs CPU, this - * function returns failure back to __call_rcu(), which can complain - * appropriately. - * - * Otherwise, this function queues the callback where the corresponding - * "rcuo" kthread can find it. - */ -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy, unsigned long flags) -{ - - if (!rcu_is_nocb_cpu(rdp->cpu)) - return false; - __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); - if (__is_kfree_rcu_offset((unsigned long)rhp->func)) - trace_rcu_kfree_callback(rcu_state.name, rhp, - (unsigned long)rhp->func, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -rcu_get_n_cbs_nocb_cpu(rdp)); - else - trace_rcu_callback(rcu_state.name, rhp, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -rcu_get_n_cbs_nocb_cpu(rdp)); - - /* - * If called from an extended quiescent state with interrupts - * disabled, invoke the RCU core in order to allow the idle-entry - * deferred-wakeup check to function. - */ - if (irqs_disabled_flags(flags) && - !rcu_is_watching() && - cpu_online(smp_processor_id())) - invoke_rcu_core(); - - return true; -} - -/* - * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is - * not a no-CBs CPU. - */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, - struct rcu_data *rdp, - unsigned long flags) -{ - lockdep_assert_irqs_disabled(); - if (!rcu_is_nocb_cpu(smp_processor_id())) - return false; /* Not NOCBs CPU, caller must migrate CBs. */ - __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), - rcu_segcblist_tail(&rdp->cblist), - rcu_segcblist_n_cbs(&rdp->cblist), - rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); - rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_disable(&rdp->cblist); - return true; -} - -/* - * If necessary, kick off a new grace period, and either way wait - * for a subsequent grace period to complete. 
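The enqueue path above wakes the leader in two cases: the queue was empty (WakeEmpty) or its length exceeds qlen_last_fqs_check + qhimark (WakeOvf, with the snapshot reset to LONG_MAX/2 to throttle repeat wakeups). The decision logic extracted as a pure function; qhimark's value of 10000 matches the kernel's module-parameter default, and the state struct is a stand-in for the relevant rcu_data fields:

```c
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define QHIMARK 10000	/* kernel default for the qhimark module param */

struct q_state {
	long len;			/* callbacks now queued */
	long qlen_last_fqs_check;	/* snapshot at last wake decision */
};

/* Returns true if the rcuo leader should be woken, following the
 * WakeEmpty/WakeOvf/WakeNot branches of __call_rcu_nocb_enqueue(). */
static bool should_wake(struct q_state *q, bool was_empty)
{
	if (was_empty) {
		q->qlen_last_fqs_check = 0;
		return true;			/* WakeEmpty */
	}
	if (q->len > q->qlen_last_fqs_check + QHIMARK) {
		q->qlen_last_fqs_check = LONG_MAX / 2;
		return true;			/* WakeOvf */
	}
	return false;				/* WakeNot */
}

int main(void)
{
	struct q_state q = { .len = 10001, .qlen_last_fqs_check = 0 };

	printf("overflow wake: %d\n", should_wake(&q, false));	/* 1 */
	printf("repeat wake:   %d\n", should_wake(&q, false));	/* 0: throttled */
	return 0;
}
```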
- */ -static void rcu_nocb_wait_gp(struct rcu_data *rdp) -{ - unsigned long c; - bool d; - unsigned long flags; - bool needwake; - struct rcu_node *rnp = rdp->mynode; - - local_irq_save(flags); - c = rcu_seq_snap(&rcu_state.gp_seq); - if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { - local_irq_restore(flags); - } else { - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_start_this_gp(rnp, rdp, c); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (needwake) - rcu_gp_kthread_wake(); - } - - /* - * Wait for the grace period. Do so interruptibly to avoid messing - * up the load average. - */ - trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); - for (;;) { - swait_event_interruptible_exclusive( - rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], - (d = rcu_seq_done(&rnp->gp_seq, c))); - if (likely(d)) - break; - WARN_ON(signal_pending(current)); - trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); - } - trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); - smp_mb(); /* Ensure that CB invocation happens after GP end. */ -} - -/* - * Leaders come here to wait for additional callbacks to show up. - * This function does not return until callbacks appear. - */ -static void nocb_leader_wait(struct rcu_data *my_rdp) -{ - bool firsttime = true; - unsigned long flags; - bool gotcbs; - struct rcu_data *rdp; - struct rcu_head **tail; - -wait_again: - - /* Wait for callbacks to appear. */ - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_wq, - !READ_ONCE(my_rdp->nocb_leader_sleep)); - raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); - my_rdp->nocb_leader_sleep = true; - WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - del_timer(&my_rdp->nocb_timer); - raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); - } else if (firsttime) { - firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll")); - } - - /* - * Each pass through the following loop checks a follower for CBs. - * We are our own first follower. Any CBs found are moved to - * nocb_gp_head, where they await a grace period. - */ - gotcbs = false; - smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); - if (!rdp->nocb_gp_head) - continue; /* No CBs here, try next follower. */ - - /* Move callbacks to wait-for-GP list, which is empty. */ - WRITE_ONCE(rdp->nocb_head, NULL); - rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - gotcbs = true; - } - - /* No callbacks? Sleep a bit if polling, and go retry. */ - if (unlikely(!gotcbs)) { - WARN_ON(signal_pending(current)); - if (rcu_nocb_poll) { - schedule_timeout_interruptible(1); - } else { - trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, - TPS("WokeEmpty")); - } - goto wait_again; - } - - /* Wait for one grace period. */ - rcu_nocb_wait_gp(my_rdp); - - /* Each pass through the following loop wakes a follower, if needed. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (!rcu_nocb_poll && - READ_ONCE(rdp->nocb_head) && - READ_ONCE(my_rdp->nocb_leader_sleep)) { - raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); - my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ - raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); - } - if (!rdp->nocb_gp_head) - continue; /* No CBs, so no need to wake follower. */ - - /* Append callbacks to follower's "done" list. 
*/ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - tail = rdp->nocb_follower_tail; - rdp->nocb_follower_tail = rdp->nocb_gp_tail; - *tail = rdp->nocb_gp_head; - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { - /* List was empty, so wake up the follower. */ - swake_up_one(&rdp->nocb_wq); - } - } - - /* If we (the leader) don't have CBs, go wait some more. */ - if (!my_rdp->nocb_follower_head) - goto wait_again; -} - -/* - * Followers come here to wait for additional callbacks to show up. - * This function does not return until callbacks appear. - */ -static void nocb_follower_wait(struct rcu_data *rdp) -{ - for (;;) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); - swait_event_interruptible_exclusive(rdp->nocb_wq, - READ_ONCE(rdp->nocb_follower_head)); - if (smp_load_acquire(&rdp->nocb_follower_head)) { - /* ^^^ Ensure CB invocation follows _head test. */ - return; - } - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } -} - -/* - * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU, however, there is - * an optional leader-follower relationship so that the grace-period - * kthreads don't have to do quite so many wakeups. - */ -static int rcu_nocb_kthread(void *arg) -{ - int c, cl; - unsigned long flags; - struct rcu_head *list; - struct rcu_head *next; - struct rcu_head **tail; - struct rcu_data *rdp = arg; - - /* Each pass through this loop invokes one batch of callbacks */ - for (;;) { - /* Wait for callbacks. */ - if (rdp->nocb_leader == rdp) - nocb_leader_wait(rdp); - else - nocb_follower_wait(rdp); - - /* Pull the ready-to-invoke callbacks onto local list. */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - list = rdp->nocb_follower_head; - rdp->nocb_follower_head = NULL; - tail = rdp->nocb_follower_tail; - rdp->nocb_follower_tail = &rdp->nocb_follower_head; - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - if (WARN_ON_ONCE(!list)) - continue; - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); - - /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rcu_state.name, - atomic_long_read(&rdp->nocb_q_count_lazy), - rcu_get_n_cbs_nocb_cpu(rdp), -1); - c = cl = 0; - while (list) { - next = list->next; - /* Wait for enqueuing to complete, if needed. */ - while (next == NULL && &list->next != tail) { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WaitQueue")); - schedule_timeout_interruptible(1); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, - TPS("WokeQueue")); - next = list->next; - } - debug_rcu_head_unqueue(list); - local_bh_disable(); - if (__rcu_reclaim(rcu_state.name, list)) - cl++; - c++; - local_bh_enable(); - cond_resched_tasks_rcu_qs(); - list = next; - } - trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1); - smp_mb__before_atomic(); /* _add after CB invocation. */ - atomic_long_add(-c, &rdp->nocb_q_count); - atomic_long_add(-cl, &rdp->nocb_q_count_lazy); - } - return 0; -} - -/* Is a deferred wakeup of rcu_nocb_kthread() required? */ -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) -{ - return READ_ONCE(rdp->nocb_defer_wakeup); -} - -/* Do a deferred wakeup of rcu_nocb_kthread(). 
*/ -static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) -{ - unsigned long flags; - int ndw; - - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); - if (!rcu_nocb_need_deferred_wakeup(rdp)) { - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - return; - } - ndw = READ_ONCE(rdp->nocb_defer_wakeup); - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); -} - -/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ -static void do_nocb_deferred_wakeup_timer(struct timer_list *t) -{ - struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); - - do_nocb_deferred_wakeup_common(rdp); -} - -/* - * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. - * This means we do an inexact common-case check. Note that if - * we miss, ->nocb_timer will eventually clean things up. - */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) -{ - if (rcu_nocb_need_deferred_wakeup(rdp)) - do_nocb_deferred_wakeup_common(rdp); -} - -void __init rcu_init_nohz(void) -{ - int cpu; - bool need_rcu_nocb_mask = false; - -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) - need_rcu_nocb_mask = true; -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ - - if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { - if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { - pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); - return; - } - } - if (!cpumask_available(rcu_nocb_mask)) - return; - -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running) - cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); -#endif /* #if defined(CONFIG_NO_HZ_FULL) */ - - if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); - cpumask_and(rcu_nocb_mask, cpu_possible_mask, - rcu_nocb_mask); - } - if (cpumask_empty(rcu_nocb_mask)) - pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); - else - pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", - cpumask_pr_args(rcu_nocb_mask)); - if (rcu_nocb_poll) - pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - - for_each_cpu(cpu, rcu_nocb_mask) - init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); - rcu_organize_nocb_kthreads(); -} - -/* Initialize per-rcu_data variables for no-CBs CPUs. */ -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ - rdp->nocb_tail = &rdp->nocb_head; - init_swait_queue_head(&rdp->nocb_wq); - rdp->nocb_follower_tail = &rdp->nocb_follower_head; - raw_spin_lock_init(&rdp->nocb_lock); - timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); -} - -/* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it. If the CPUs are brought online out of order, - * this can require re-organizing the leader-follower relationships. - */ -static void rcu_spawn_one_nocb_kthread(int cpu) -{ - struct rcu_data *rdp; - struct rcu_data *rdp_last; - struct rcu_data *rdp_old_leader; - struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu); - struct task_struct *t; - - /* - * If this isn't a no-CBs CPU or if it already has an rcuo kthread, - * then nothing to do. - */ - if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) - return; - - /* If we didn't spawn the leader first, reorganize! 
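wake_nocb_leader_defer() and do_nocb_deferred_wakeup_common() above form a small state machine: callers that cannot wake immediately (interrupts disabled) record RCU_NOCB_WAKE or RCU_NOCB_WAKE_FORCE and arm a one-jiffy timer, and the handler consumes the state exactly once. A compressed model of just that state handling, with the timer itself elided and printf standing in for the actual wakeups:

```c
#include <stdio.h>

enum { WAKE_NOT, WAKE, WAKE_FORCE };

static int defer_state = WAKE_NOT;

/* Record a deferred wakeup; only the first deferral arms the timer,
 * as in wake_nocb_leader_defer(). */
static void defer_wake(int waketype)
{
	if (defer_state == WAKE_NOT)
		printf("arm 1-jiffy timer\n");	/* mod_timer(..., jiffies + 1) */
	defer_state = waketype;
}

/* Timer or fastpath handler: consume the state idempotently. */
static void do_deferred_wake(void)
{
	int ndw = defer_state;

	if (ndw == WAKE_NOT)
		return;
	defer_state = WAKE_NOT;
	printf("wake leader%s\n", ndw == WAKE_FORCE ? " (force)" : "");
}

int main(void)
{
	defer_wake(WAKE);
	defer_wake(WAKE_FORCE);	/* upgrades the state; timer already armed */
	do_deferred_wake();	/* one wake */
	do_deferred_wake();	/* no-op */
	return 0;
}
```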
*/ - rdp_old_leader = rdp_spawn->nocb_leader; - if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { - rdp_last = NULL; - rdp = rdp_old_leader; - do { - rdp->nocb_leader = rdp_spawn; - if (rdp_last && rdp != rdp_spawn) - rdp_last->nocb_next_follower = rdp; - if (rdp == rdp_spawn) { - rdp = rdp->nocb_next_follower; - } else { - rdp_last = rdp; - rdp = rdp->nocb_next_follower; - rdp_last->nocb_next_follower = NULL; - } - } while (rdp); - rdp_spawn->nocb_next_follower = rdp_old_leader; - } - - /* Spawn the kthread for this CPU. */ - t = kthread_run(rcu_nocb_kthread, rdp_spawn, - "rcuo%c/%d", rcu_state.abbr, cpu); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) - return; - WRITE_ONCE(rdp_spawn->nocb_kthread, t); -} - -/* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthreads, spawn them. - */ -static void rcu_spawn_all_nocb_kthreads(int cpu) -{ - if (rcu_scheduler_fully_active) - rcu_spawn_one_nocb_kthread(cpu); -} - -/* - * Once the scheduler is running, spawn rcuo kthreads for all online - * no-CBs CPUs. This assumes that the early_initcall()s happen before - * non-boot CPUs come online -- if this changes, we will need to add - * some mutual exclusion. - */ -static void __init rcu_spawn_nocb_kthreads(void) -{ - int cpu; - - for_each_online_cpu(cpu) - rcu_spawn_all_nocb_kthreads(cpu); -} - -/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ -static int rcu_nocb_leader_stride = -1; -module_param(rcu_nocb_leader_stride, int, 0444); - -/* - * Initialize leader-follower relationships for all no-CBs CPU. - */ -static void __init rcu_organize_nocb_kthreads(void) -{ - int cpu; - int ls = rcu_nocb_leader_stride; - int nl = 0; /* Next leader. */ - struct rcu_data *rdp; - struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ - struct rcu_data *rdp_prev = NULL; - - if (!cpumask_available(rcu_nocb_mask)) - return; - if (ls == -1) { - ls = int_sqrt(nr_cpu_ids); - rcu_nocb_leader_stride = ls; - } - - /* - * Each pass through this loop sets up one rcu_data structure. - * Should the corresponding CPU come online in the future, then - * we will spawn the needed set of rcu_nocb_kthread() kthreads. - */ - for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->cpu >= nl) { - /* New leader, set up for followers & next leader. */ - nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; - rdp->nocb_leader = rdp; - rdp_leader = rdp; - } else { - /* Another follower, link to previous leader. */ - rdp->nocb_leader = rdp_leader; - rdp_prev->nocb_next_follower = rdp; - } - rdp_prev = rdp; - } -} - -/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ - if (!rcu_is_nocb_cpu(rdp->cpu)) - return false; - - /* If there are early-boot callbacks, move them to nocb lists. */ - if (!rcu_segcblist_empty(&rdp->cblist)) { - rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); - rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); - atomic_long_set(&rdp->nocb_q_count, - rcu_segcblist_n_cbs(&rdp->cblist)); - atomic_long_set(&rdp->nocb_q_count_lazy, - rcu_segcblist_n_lazy_cbs(&rdp->cblist)); - rcu_segcblist_init(&rdp->cblist); - } - rcu_segcblist_disable(&rdp->cblist); - return true; -} - -/* - * Bind the current task to the offloaded CPUs. If there are no offloaded - * CPUs, leave the task unbound. Splat if the bind attempt fails. 
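rcu_organize_nocb_kthreads() above groups no-CBs CPUs into leader/follower sets of roughly sqrt(nr_cpu_ids) CPUs each: a CPU becomes the next leader when its number reaches nl, which then advances to DIV_ROUND_UP(cpu + 1, ls) * ls. The grouping arithmetic on its own, assuming 16 contiguous CPUs so that ls = 4 (compile with -lm):

```c
#include <math.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpu_ids = 16;
	int ls = (int)sqrt(nr_cpu_ids);		/* int_sqrt() analogue */
	int nl = 0;				/* next leader */

	for (int cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (cpu >= nl) {
			nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
			printf("CPU %2d: leader\n", cpu);
		} else {
			printf("CPU %2d: follows leader %d\n",
			       cpu, (cpu / ls) * ls);
		}
	}
	return 0;
}
```

This two-level fanout is what lets the grace-period machinery wake one leader per group instead of every rcuo kthread.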
- */ -void rcu_bind_current_to_nocb(void) -{ - if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) - WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); -} -EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); - -/* - * Return the number of RCU callbacks still queued from the specified - * CPU, which must be a nocbs CPU. - */ -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) -{ - return atomic_long_read(&rdp->nocb_q_count); -} - -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - -static bool rcu_nocb_cpu_needs_barrier(int cpu) -{ - WARN_ON_ONCE(1); /* Should be dead code. */ - return false; -} - -static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) -{ -} - -static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) -{ - return NULL; -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ -} - -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy, unsigned long flags) -{ - return false; -} - -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, - struct rcu_data *rdp, - unsigned long flags) -{ - return false; -} - -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ -} - -static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) -{ - return false; -} - -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) -{ -} - -static void rcu_spawn_all_nocb_kthreads(int cpu) -{ -} - -static void __init rcu_spawn_nocb_kthreads(void) -{ -} - -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ - return false; -} - -static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) -{ - return 0; -} - -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -2697,14 +1342,14 @@ static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) * CPU unless the grace period has extended for too long. * * This code relies on the fact that all NO_HZ_FULL CPUs are also - * CONFIG_RCU_NOCB_CPU CPUs. + * RCU_NOCB_CPU CPUs. */ static bool rcu_nohz_full_cpu(void) { #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress() || - ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; @@ -2717,21 +1362,5 @@ static void rcu_bind_gp_kthread(void) { if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current, HK_FLAG_RCU); -} - -/* Record the current task on dyntick-idle entry. */ -static void rcu_dynticks_task_enter(void) -{ -#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); -#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ -} - -/* Record no current task on dyntick-idle exit. 
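The rcu_nohz_full_cpu() hunk above replaces ULONG_CMP_LT(jiffies, gp_start + HZ) with time_before(), which stays correct when jiffies wraps because it compares via a signed subtraction. Here is the idiom in isolation, with values chosen to straddle the wrap point (HZ taken as 100 for the example):

```c
#include <stdio.h>

/* Wraparound-safe comparison, as in include/linux/jiffies.h. */
#define time_before(a, b) ((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long gp_start = ~0UL - 200;	/* GP began 206 ticks ago, pre-wrap */
	unsigned long jiffies = 5;		/* jiffies has since wrapped */

	/* 206 ticks elapsed, so we are NOT within HZ=100 of gp_start. */
	printf("time_before: %d\n", time_before(jiffies, gp_start + 100)); /* 0 */
	printf("naive <:     %d\n", jiffies < gp_start + 100);	/* 1 (wrong) */
	return 0;
}
```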
*/ -static void rcu_dynticks_task_exit(void) -{ -#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) - WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); -#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ + housekeeping_affine(current, HK_TYPE_RCU); } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h new file mode 100644 index 000000000000..b67532cb8770 --- /dev/null +++ b/kernel/rcu/tree_stall.h @@ -0,0 +1,1185 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RCU CPU stall warnings for normal RCU grace periods + * + * Copyright IBM Corporation, 2019 + * + * Author: Paul E. McKenney <paulmck@linux.ibm.com> + */ + +#include <linux/console.h> +#include <linux/kvm_para.h> +#include <linux/rcu_notifier.h> +#include <linux/smp.h> + +////////////////////////////////////////////////////////////////////////////// +// +// Controlling CPU stall warnings, including delay calculation. + +/* panic() on RCU Stall sysctl. */ +static int sysctl_panic_on_rcu_stall __read_mostly; +static int sysctl_max_rcu_stall_to_panic __read_mostly; + +static const struct ctl_table rcu_stall_sysctl_table[] = { + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "max_rcu_stall_to_panic", + .data = &sysctl_max_rcu_stall_to_panic, + .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, +}; + +static int __init init_rcu_stall_sysctl(void) +{ + register_sysctl_init("kernel", rcu_stall_sysctl_table); + return 0; +} + +subsys_initcall(init_rcu_stall_sysctl); + +#ifdef CONFIG_SYSFS + +static unsigned int rcu_stall_count; + +static ssize_t rcu_stall_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", rcu_stall_count); +} + +static struct kobj_attribute rcu_stall_count_attr = __ATTR_RO(rcu_stall_count); + +static __init int kernel_rcu_stall_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &rcu_stall_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_rcu_stall_sysfs_init); + +#endif // CONFIG_SYSFS + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) + +int rcu_exp_jiffies_till_stall_check(void) +{ + int cpu_stall_timeout = READ_ONCE(rcu_exp_cpu_stall_timeout); + int exp_stall_delay_delta = 0; + int till_stall_check; + + // Zero says to use rcu_cpu_stall_timeout, but in milliseconds. + if (!cpu_stall_timeout) + cpu_stall_timeout = jiffies_to_msecs(rcu_jiffies_till_stall_check()); + + // Limit check must be consistent with the Kconfig limits for + // CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range. + // The minimum clamped value is "2UL", because at least one full + // tick has to be guaranteed. + till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ); + + if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout) + WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); + +#ifdef CONFIG_PROVE_RCU + /* Add extra ~25% out of till_stall_check. 
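rcu_exp_jiffies_till_stall_check() above clamps the millisecond timeout into [2 jiffies, 300*HZ] and, as this comment notes, pads the result by about 25% under PROVE_RCU so that lockdep-enabled kernels do not warn spuriously. The same computation in plain C, with HZ fixed at 1000 so that msecs_to_jiffies() reduces to the identity:

```c
#include <stdio.h>

#define HZ 1000
#define clamp(v, lo, hi) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

/* msecs -> jiffies, then the [2, 300*HZ] clamp and the ~25%
 * PROVE_RCU margin from rcu_exp_jiffies_till_stall_check(). */
static unsigned long exp_stall_jiffies(unsigned long timeout_ms, int prove_rcu)
{
	unsigned long j = clamp(timeout_ms * HZ / 1000, 2UL, 300UL * HZ);

	if (prove_rcu)
		j += (j * 25) / 100 + 1;
	return j;
}

int main(void)
{
	printf("20ms           -> %lu jiffies\n", exp_stall_jiffies(20, 0)); /* 20 */
	printf("1ms            -> %lu jiffies\n", exp_stall_jiffies(1, 0));  /* clamped to 2 */
	printf("20ms+PROVE_RCU -> %lu jiffies\n", exp_stall_jiffies(20, 1)); /* 26 */
	return 0;
}
```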
*/ + exp_stall_delay_delta = ((till_stall_check * 25) / 100) + 1; +#endif + + return till_stall_check + exp_stall_delay_delta; +} +EXPORT_SYMBOL_GPL(rcu_exp_jiffies_till_stall_check); + +/* Limit-check stall timeouts specified at boottime and runtime. */ +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. + */ + if (till_stall_check < 3) { + WRITE_ONCE(rcu_cpu_stall_timeout, 3); + till_stall_check = 3; + } else if (till_stall_check > 300) { + WRITE_ONCE(rcu_cpu_stall_timeout, 300); + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} +EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); + +/* Don't do RCU CPU stall warnings during long sysrq printouts. */ +void rcu_sysrq_start(void) +{ + if (!rcu_cpu_stall_suppress) + rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ + if (rcu_cpu_stall_suppress == 2) + rcu_cpu_stall_suppress = 0; +} + +/* Don't print RCU CPU stall warnings during a kernel panic. */ +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); + +/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ +static void panic_on_rcu_stall(void) +{ + static int cpu_stall; + + /* + * Attempt to kick out the BPF scheduler if it's installed and defer + * the panic to give the system a chance to recover. + */ + if (scx_rcu_cpu_stall()) + return; + + if (++cpu_stall < sysctl_max_rcu_stall_to_panic) + return; + + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + +/** + * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period + * + * To perform the reset request from the caller, disable stall detection until + * 3 fqs loops have passed. This is required to ensure a fresh jiffies is + * loaded. It should be safe to do from the fqs loop as enough timer + * interrupts and context switches should have passed. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3); + WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Interaction with RCU grace periods + +/* Start of new grace period, so record stall time (and forcing times). */ +static void record_gp_stall_check_time(void) +{ + unsigned long j = jiffies; + unsigned long j1; + + WRITE_ONCE(rcu_state.gp_start, j); + j1 = rcu_jiffies_till_stall_check(); + smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0); + WRITE_ONCE(rcu_state.jiffies_stall, j + j1); + rcu_state.jiffies_resched = j + j1 / 2; + rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); +} + +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. 
*/ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(void) +{ + unsigned long j; + + if (!READ_ONCE(rcu_kick_kthreads)) + return; + j = READ_ONCE(rcu_state.jiffies_kick_kthreads); + if (time_after(jiffies, j) && rcu_state.gp_kthread && + (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", + rcu_state.name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rcu_state.gp_kthread); + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); + } +} + +/* + * Handler for the irq_work request posted about halfway into the RCU CPU + * stall timeout, and used to detect excessive irq disabling. Set state + * appropriately, but just complain if there is unexpected state on entry. + */ +static void rcu_iw_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = container_of(iwp, struct rcu_data, rcu_iw); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); + if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { + rdp->rcu_iw_gp_seq = rnp->gp_seq; + rdp->rcu_iw_pending = false; + } + raw_spin_unlock_rcu_node(rnp); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Printing RCU CPU stall warnings + +#ifdef CONFIG_PREEMPT_RCU + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct task_struct *t; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); + sched_show_task(t); + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +// Communicate task state back to the RCU CPU stall warning request. +struct rcu_stall_chk_rdr { + int nesting; + union rcu_special rs; + bool on_blkd_list; +}; + +/* + * Report out the state of a not-running task that is stalling the + * current RCU grace period. + */ +static int check_slow_task(struct task_struct *t, void *arg) +{ + struct rcu_stall_chk_rdr *rscrp = arg; + + if (task_curr(t)) + return -EBUSY; // It is running, so decline to inspect it. + rscrp->nesting = t->rcu_read_lock_nesting; + rscrp->rs = t->rcu_read_unlock_special; + rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry); + return 0; +} + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each of the first few of them. 
+ */ +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + int i = 0; + int ndetected = 0; + struct rcu_stall_chk_rdr rscr; + struct task_struct *t; + struct task_struct *ts[8]; + + lockdep_assert_irqs_disabled(); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return 0; + } + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + get_task_struct(t); + ts[i++] = t; + if (i >= ARRAY_SIZE(ts)) + break; + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + while (i) { + t = ts[--i]; + if (task_call_func(t, check_slow_task, &rscr)) + pr_cont(" P%d", t->pid); + else + pr_cont(" P%d/%d:%c%c%c%c", + t->pid, rscr.nesting, + ".b"[rscr.rs.b.blocked], + ".q"[rscr.rs.b.need_qs], + ".e"[rscr.rs.b.exp_hint], + ".l"[rscr.on_blkd_list]); + lockdep_assert_irqs_disabled(); + put_task_struct(t); + ndetected++; + } + pr_cont("\n"); + return ndetected; +} + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return 0; +} +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. + */ +static void rcu_dump_cpu_stacks(unsigned long gp_seq) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rnp) { + printk_deferred_enter(); + for_each_leaf_node_possible_cpu(rnp, cpu) { + if (gp_seq != data_race(rcu_state.gp_seq)) { + printk_deferred_exit(); + pr_err("INFO: Stall ended during stack backtracing.\n"); + return; + } + if (!(data_race(rnp->qsmask) & leaf_node_cpu_bit(rnp, cpu))) + continue; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + if (cpu_is_offline(cpu)) + pr_err("Offline CPU %d blocking current GP.\n", cpu); + else + dump_cpu_task(cpu); + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + printk_deferred_exit(); + } +} + +static const char * const gp_state_names[] = { + [RCU_GP_IDLE] = "RCU_GP_IDLE", + [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", + [RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS", + [RCU_GP_ONOFF] = "RCU_GP_ONOFF", + [RCU_GP_INIT] = "RCU_GP_INIT", + [RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS", + [RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS", + [RCU_GP_CLEANUP] = "RCU_GP_CLEANUP", + [RCU_GP_CLEANED] = "RCU_GP_CLEANED", +}; + +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + +/* Is the RCU grace-period kthread being starved of CPU time? 
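rcu_print_task_stall() above packs each blocked task's flags into single characters with expressions like ".b"[rscr.rs.b.blocked], indexing directly into a two-character string literal, and print_cpu_stall_info() below does the same with "O."[!!cpu_online(cpu)]. Since the idiom reads oddly the first time, here it is on its own:

```c
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	bool online = true, qsmaskinit = false;

	/* "xy"[flag] picks 'x' for 0 and 'y' for 1, so "O."[!!online]
	 * prints '.' when the CPU is online and 'O' when it is not,
	 * exactly as in the stall-warning printouts. */
	printf("%c%c\n", "O."[!!online], "o."[!!qsmaskinit]);	/* prints ".o" */
	return 0;
}
```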
*/ +static bool rcu_is_gp_kthread_starving(unsigned long *jp) +{ + unsigned long j = jiffies - READ_ONCE(rcu_state.gp_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + +static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp) +{ + int cpu; + struct task_struct *rcuc; + unsigned long j; + + rcuc = rdp->rcu_cpu_kthread_task; + if (!rcuc) + return false; + + cpu = task_cpu(rcuc); + if (cpu_is_offline(cpu) || idle_cpu(cpu)) + return false; + + j = jiffies - READ_ONCE(rdp->rcuc_activity); + + if (jp) + *jp = j; + return j > 2 * HZ; +} + +static void print_cpu_stat_info(int cpu) +{ + struct rcu_snap_record rsr, *rsrp; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu); + + if (!rcu_cpu_stall_cputime) + return; + + rsrp = &rdp->snap_record; + if (rsrp->gp_seq != rdp->gp_seq) + return; + + rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); + rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); + rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); + + pr_err("\t hardirqs softirqs csw/system\n"); + pr_err("\t number: %8lld %10d %12lld\n", + kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu) - rsrp->nr_hardirqs, + kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, + nr_context_switches_cpu(cpu) - rsrp->nr_csw); + pr_err("\tcputime: %8lld %10lld %12lld ==> %d(ms)\n", + div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC), + div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC), + div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC), + jiffies_to_msecs(jiffies - rsrp->jiffies)); +} + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period, then + * print the number of scheduling clock interrupts the CPU has taken + * during the time that it has been aware. Otherwise, print the number + * of RCU grace periods that this CPU is ignorant of, for example, "1" + * if the CPU was aware of the previous grace period. + * + * Also print out idle info. + */ +static void print_cpu_stall_info(int cpu) +{ + unsigned long delta; + bool falsepositive; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + char *ticks_title; + unsigned long ticks_value; + bool rcuc_starved; + unsigned long j; + char buf[32]; + + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. + */ + touch_nmi_watchdog(); + + ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); + if (ticks_value) { + ticks_title = "GPs behind"; + } else { + ticks_title = "ticks this GP"; + ticks_value = rdp->ticks_this_gp; + } + delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); + falsepositive = rcu_is_gp_kthread_starving(NULL) && + rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu)); + rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); + if (rcuc_starved) + // Print signed value, as negative values indicate a probable bug. + snprintf(buf, sizeof(buf), " rcuc=%ld jiffies(starved)", j); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : + rdp->rcu_iw_pending ? 
(int)min(delta, 9UL) + '0' : + "!."[!delta], + ticks_value, ticks_title, + ct_rcu_watching_cpu(cpu) & 0xffff, + ct_nesting_cpu(cpu), ct_nmi_nesting_cpu(cpu), + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + rcuc_starved ? buf : "", + falsepositive ? " (false positive?)" : ""); + + print_cpu_stat_info(cpu); +} + +/* Complain about starvation of grace-period kthread. */ +static void rcu_check_gp_kthread_starvation(void) +{ + int cpu; + struct task_struct *gpk = rcu_state.gp_kthread; + unsigned long j; + + if (rcu_is_gp_kthread_starving(&j)) { + cpu = gpk ? task_cpu(gpk) : -1; + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n", + rcu_state.name, j, + (long)rcu_seq_current(&rcu_state.gp_seq), + data_race(READ_ONCE(rcu_state.gp_flags)), + gp_state_getname(rcu_state.gp_state), + data_race(READ_ONCE(rcu_state.gp_state)), + gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu); + if (gpk) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); + pr_err("RCU grace-period kthread stack dump:\n"); + sched_show_task(gpk); + if (cpu_is_offline(cpu)) { + pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); + } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) { + pr_err("Stack dump where RCU GP kthread last ran:\n"); + dump_cpu_task(cpu); + } + wake_up_process(gpk); + } + } +} + +/* Complain about missing wakeups from expired fqs wait timer */ +static void rcu_check_gp_kthread_expired_fqs_timer(void) +{ + struct task_struct *gpk = rcu_state.gp_kthread; + short gp_state; + unsigned long jiffies_fqs; + int cpu; + + /* + * Order reads of .gp_state and .jiffies_force_qs. + * Matching smp_wmb() is present in rcu_gp_fqs_loop(). + */ + gp_state = smp_load_acquire(&rcu_state.gp_state); + jiffies_fqs = READ_ONCE(rcu_state.jiffies_force_qs); + + if (gp_state == RCU_GP_WAIT_FQS && + time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) && + gpk && !READ_ONCE(gpk->on_rq)) { + cpu = task_cpu(gpk); + pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n", + rcu_state.name, (jiffies - jiffies_fqs), + (long)rcu_seq_current(&rcu_state.gp_seq), + data_race(READ_ONCE(rcu_state.gp_flags)), // Diagnostic read + gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, + data_race(READ_ONCE(gpk->__state))); + pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", + cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu)); + } +} + +static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) +{ + int cpu; + unsigned long flags; + unsigned long gpa; + unsigned long j; + int ndetected = 0; + struct rcu_node *rnp; + long totqlen = 0; + + lockdep_assert_irqs_disabled(); + + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(); + if (rcu_stall_is_suppressed()) + return; + + nbcon_cpu_emergency_enter(); + + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.rst for info on how to debug + * RCU CPU stall warnings. 
+ */ + trace_rcu_stall_warning(rcu_state.name, TPS("StallDetected")); + pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->qsmask != 0) { + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(cpu); + ndetected++; + } + } + ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock. + lockdep_assert_irqs_disabled(); + } + + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", + smp_processor_id(), (long)(jiffies - gps), + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, + data_race(rcu_state.n_online_cpus)); // Diagnostic read + if (ndetected) { + rcu_dump_cpu_stacks(gp_seq); + + /* Complain about tasks blocking the grace period. */ + rcu_for_each_leaf_node(rnp) + rcu_print_detail_task_stall_rnp(rnp); + } else { + if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = data_race(READ_ONCE(rcu_state.gp_activity)); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", + rcu_state.name, j - gpa, j, gpa, + data_race(READ_ONCE(jiffies_till_next_fqs)), + data_race(READ_ONCE(rcu_get_root()->qsmask))); + } + } + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + + rcu_check_gp_kthread_expired_fqs_timer(); + rcu_check_gp_kthread_starvation(); + + nbcon_cpu_emergency_exit(); + + panic_on_rcu_stall(); + + rcu_force_quiescent_state(); /* Kick them all. */ +} + +static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) +{ + int cpu; + unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rcu_get_root(); + long totqlen = 0; + + lockdep_assert_irqs_disabled(); + + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(); + if (rcu_stall_is_suppressed()) + return; + + nbcon_cpu_emergency_enter(); + + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.rst for info on how to debug + * RCU CPU stall warnings. + */ + trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); + pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); + raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); + print_cpu_stall_info(smp_processor_id()); + raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", + jiffies - gps, + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, + data_race(rcu_state.n_online_cpus)); // Diagnostic read + + rcu_check_gp_kthread_expired_fqs_timer(); + rcu_check_gp_kthread_starvation(); + + rcu_dump_cpu_stacks(gp_seq); + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + nbcon_cpu_emergency_exit(); + + panic_on_rcu_stall(); + + /* + * Attempt to revive the RCU machinery by forcing a context switch. 
+ *
+ * A context switch would normally allow the RCU state machine to make
+ * progress and it could be we're stuck in kernel space without context
+ * switches for an entirely unreasonable amount of time.
+ */
+ set_need_resched_current();
+}
+
+static bool csd_lock_suppress_rcu_stall;
+module_param(csd_lock_suppress_rcu_stall, bool, 0644);
+
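The check_cpu_stall() function just below rejects false positives with four loads separated by smp_rmb(), pairing with smp_wmb()-ordered updates performed during grace-period initialization and cleanup. A minimal sketch of that pairing (the variable names are invented for illustration and are not part of this patch):

	/* Updater publishes a, then b, in that order: */
	WRITE_ONCE(a, new_a);
	smp_wmb();		/* Order the store to a before the store to b. */
	WRITE_ONCE(b, new_b);

	/* Reader loads in the opposite order: */
	cur_b = READ_ONCE(b);
	smp_rmb();		/* Pairs with the smp_wmb() above. */
	cur_a = READ_ONCE(a);	/* If cur_b is new, cur_a cannot be stale. */

Seeing a new value of b therefore guarantees seeing a value of a at least as new, which is the property that the four-load sequence below relies on.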
+static void check_cpu_stall(struct rcu_data *rdp)
+{
+ bool self_detected;
+ unsigned long gs1;
+ unsigned long gs2;
+ unsigned long gps;
+ unsigned long j;
+ unsigned long jn;
+ unsigned long js;
+ struct rcu_node *rnp;
+
+ lockdep_assert_irqs_disabled();
+ if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) ||
+     !rcu_gp_in_progress())
+ return;
+ rcu_stall_kick_kthreads();
+
+ /*
+ * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS
+ * loop has to set jiffies to ensure a non-stale jiffies value. This
+ * is required to have good jiffies value after coming out of long
+ * breaks of jiffies updates. Not doing so can cause false positives.
+ */
+ if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0)
+ return;
+
+ j = jiffies;
+
+ /*
+ * Lots of memory barriers to reject false positives.
+ *
+ * The idea is to pick up rcu_state.gp_seq, then
+ * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
+ * another copy of rcu_state.gp_seq. These values are updated in
+ * the opposite order with memory barriers (or equivalent) during
+ * grace-period initialization and cleanup. Now, a false positive
+ * can occur if we get a new value of rcu_state.gp_start and an old
+ * value of rcu_state.jiffies_stall. But given the memory barriers,
+ * the only way that this can happen is if one grace period ends
+ * and another starts between these two fetches. This is detected
+ * by comparing the second fetch of rcu_state.gp_seq with the
+ * previous fetch from rcu_state.gp_seq.
+ *
+ * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
+ * and rcu_state.gp_start suffice to forestall false positives.
+ */
+ gs1 = READ_ONCE(rcu_state.gp_seq);
+ smp_rmb(); /* Pick up ->gp_seq first... */
+ js = READ_ONCE(rcu_state.jiffies_stall);
+ smp_rmb(); /* ...then ->jiffies_stall before the rest... */
+ gps = READ_ONCE(rcu_state.gp_start);
+ smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
+ gs2 = READ_ONCE(rcu_state.gp_seq);
+ if (gs1 != gs2 ||
+     ULONG_CMP_LT(j, js) ||
+     ULONG_CMP_GE(gps, js) ||
+     !rcu_seq_state(gs2))
+ return; /* No stall or GP completed since entering function. */
+ rnp = rdp->mynode;
+ jn = jiffies + ULONG_MAX / 2;
+ self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask;
+ if (rcu_gp_in_progress() &&
+     (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) &&
+     cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
+#ifdef CONFIG_SYSFS
+ ++rcu_stall_count;
+#endif
+
+ rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
+ if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
+ pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
+ } else if (self_detected) {
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(gs2, gps);
+ } else {
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(gs2, gps);
+ }
+
+ if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
+ rcu_ftrace_dump(DUMP_ALL);
+
+ if (READ_ONCE(rcu_state.jiffies_stall) == jn) {
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcu_state.jiffies_stall, jn);
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU forward-progress mechanisms, including for callback invocation.
+
+
+/*
+ * Check to see if a failure to end RCU priority inversion was due to
+ * a CPU not passing through a quiescent state. When this happens, there
+ * is nothing that RCU priority boosting can do to help, so we shouldn't
+ * count this as an RCU priority boosting failure. A return of true says
+ * RCU priority boosting is to blame, and false says otherwise. If false
+ * is returned, the first of the CPUs to blame is stored through cpup.
+ * If there was no CPU blocking the current grace period, but also nothing
+ * in need of being boosted, *cpup is set to -1. This can happen in case
+ * of vCPU preemption while the last CPU is reporting its quiescent state,
+ * for example.
+ *
+ * If cpup is NULL, then a lockless quick check is carried out, suitable
+ * for high-rate usage. On the other hand, if cpup is non-NULL, each
+ * rcu_node structure's ->lock is acquired, ruling out high-rate usage.
+ */
+bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
+{
+ bool atb = false;
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ if (!cpup) {
+ if (data_race(READ_ONCE(rnp->qsmask))) {
+ return false;
+ } else {
+ if (READ_ONCE(rnp->gp_tasks))
+ atb = true;
+ continue;
+ }
+ }
+ *cpup = -1;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (rnp->gp_tasks)
+ atb = true;
+ if (!rnp->qsmask) {
+ // No CPUs without quiescent states for this rnp.
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ continue;
+ }
+ // Find the first holdout CPU.
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ if (rnp->qsmask & (1UL << (cpu - rnp->grplo))) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ *cpup = cpu;
+ return false;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+ // Can't blame CPUs, so must blame RCU priority boosting.
+ return atb;
+}
+EXPORT_SYMBOL_GPL(rcu_check_boost_fail);
+
+/*
+ * Show the state of the grace-period kthreads.
+ */
+void show_rcu_gp_kthreads(void)
+{
+ unsigned long cbs = 0;
+ int cpu;
+ unsigned long j;
+ unsigned long ja;
+ unsigned long jr;
+ unsigned long js;
+ unsigned long jw;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
+
+ j = jiffies;
+ ja = j - data_race(READ_ONCE(rcu_state.gp_activity));
+ jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity));
+ js = j - data_race(READ_ONCE(rcu_state.gp_start));
+ jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time));
+ pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
+ rcu_state.name, gp_state_getname(rcu_state.gp_state),
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ?
t->rt_priority : 0xffU, + js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)), + (long)data_race(READ_ONCE(rcu_state.gp_seq)), + (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)), + data_race(READ_ONCE(rcu_state.gp_max)), + data_race(READ_ONCE(rcu_state.gp_flags))); + rcu_for_each_node_breadth_first(rnp) { + if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) && + !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) && + !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks))) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n", + rnp->grplo, rnp->grphi, + (long)data_race(READ_ONCE(rnp->gp_seq)), + (long)data_race(READ_ONCE(rnp->gp_seq_needed)), + data_race(READ_ONCE(rnp->qsmask)), + ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))], + ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))], + ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))], + ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))], + data_race(READ_ONCE(rnp->n_boosts))); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (READ_ONCE(rdp->gpwrap) || + ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), + READ_ONCE(rdp->gp_seq_needed))) + continue; + pr_info("\tcpu %d ->gp_seq_needed %ld\n", + cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed))); + } + } + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + cbs += data_race(READ_ONCE(rdp->n_cbs_invoked)); + show_rcu_nocb_state(rdp); + } + pr_info("RCU callbacks invoked since boot: %lu\n", cbs); + show_rcu_tasks_gp_kthreads(); +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay) +{ + unsigned long flags; + unsigned long j; + struct rcu_node *rnp_root = rcu_get_root(); + static atomic_t warned = ATOMIC_INIT(0); + + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || + !smp_load_acquire(&rcu_state.gp_kthread)) // Get stable kthread. + return; + j = jiffies; /* Expensive access, and in common case don't get here. */ + if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) + return; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + /* Hold onto the leaf lock to make others see warned==1. */ + + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(READ_ONCE(rnp_root->gp_seq), + READ_ONCE(rnp_root->gp_seq_needed)) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_xchg(&warned, 1)) { + if (rnp_root != rnp) + /* irqs remain disabled. 
 */
+ raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ WARN_ON(1);
+ if (rnp_root != rnp)
+ raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ show_rcu_gp_kthreads();
+}
+
+/*
+ * Do a forward-progress check for rcutorture. This is normally invoked
+ * due to an OOM event. The argument "j" gives the time period during
+ * which rcutorture would like progress to have been made.
+ */
+void rcu_fwd_progress_check(unsigned long j)
+{
+ unsigned long cbs;
+ int cpu;
+ unsigned long max_cbs = 0;
+ int max_cpu = -1;
+ struct rcu_data *rdp;
+
+ if (rcu_gp_in_progress()) {
+ pr_info("%s: GP age %lu jiffies\n",
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start)));
+ show_rcu_gp_kthreads();
+ } else {
+ pr_info("%s: Last GP end %lu jiffies ago\n",
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end)));
+ preempt_disable();
+ rdp = this_cpu_ptr(&rcu_data);
+ rcu_check_gp_start_stall(rdp->mynode, rdp, j);
+ preempt_enable();
+ }
+ for_each_possible_cpu(cpu) {
+ cbs = rcu_get_n_cbs_cpu(cpu);
+ if (!cbs)
+ continue;
+ if (max_cpu < 0)
+ pr_info("%s: callbacks", __func__);
+ pr_cont(" %d: %lu", cpu, cbs);
+ if (cbs <= max_cbs)
+ continue;
+ max_cbs = cbs;
+ max_cpu = cpu;
+ }
+ if (max_cpu >= 0)
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
+
+/* Commandeer a sysrq key to dump RCU's tree. */
+static bool sysrq_rcu;
+module_param(sysrq_rcu, bool, 0444);
+
+/* Dump grace-period-request information due to commandeered sysrq. */
+static void sysrq_show_rcu(u8 key)
+{
+ show_rcu_gp_kthreads();
+}
+
+static const struct sysrq_key_op sysrq_rcudump_op = {
+ .handler = sysrq_show_rcu,
+ .help_msg = "show-rcu(y)",
+ .action_msg = "Show RCU tree",
+ .enable_mask = SYSRQ_ENABLE_DUMP,
+};
+
+static int __init rcu_sysrq_init(void)
+{
+ if (sysrq_rcu)
+ return register_sysrq_key('y', &sysrq_rcudump_op);
+ return 0;
+}
+early_initcall(rcu_sysrq_init);
+
+#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU CPU stall-warning notifiers
+
+static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list);
+
+/**
+ * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier
+ * @n: Entry to add.
+ *
+ * Adds an RCU CPU stall notifier to an atomic notifier chain.
+ * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or
+ * friends. The @data will be the duration of the stalled grace period,
+ * in jiffies, coerced to a void* pointer.
+ *
+ * Returns 0 on success, %-EEXIST on error.
+ */
+int rcu_stall_chain_notifier_register(struct notifier_block *n)
+{
+ int rcsn = rcu_cpu_stall_notifiers;
+
+ WARN(1, "Adding %pS() to RCU stall notifier list (%s).\n", n->notifier_call,
+ rcsn ? "possibly suppressing RCU CPU stall warnings" : "failed, so all is well");
+ if (rcsn)
+ return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n);
+ return -EEXIST;
+}
+EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register);
+
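As a usage illustration (not part of this patch; the callback and variable names below are invented for the example), a debugging module could hook this chain as follows, receiving the stall duration in jiffies as the data argument:

	static int stall_debug_notify(struct notifier_block *nb,
				      unsigned long action, void *data)
	{
		/* For RCU_STALL_NOTIFY_NORM, data is the stall age in jiffies. */
		pr_info("RCU stall notifier: action=%lu age=%lu jiffies\n",
			action, (unsigned long)data);
		return NOTIFY_OK;
	}

	static struct notifier_block stall_debug_nb = {
		.notifier_call = stall_debug_notify,
	};

	/* From module init: rcu_stall_chain_notifier_register(&stall_debug_nb); */

+/**
+ * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier
+ * @n: Entry to remove.
+ *
+ * Removes an RCU CPU stall notifier from an atomic notifier chain.
+ *
+ * Returns zero on success, %-ENOENT on failure.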
+ */ +int rcu_stall_chain_notifier_unregister(struct notifier_block *n) +{ + return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n); +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister); + +/* + * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in the RCU CPU stall notifier chain in turn, which + * is an atomic call chain. See atomic_notifier_call_chain() for more + * information. + * + * This is for use within RCU, hence the omission of the extra asterisk + * to indicate a non-kerneldoc format header comment. + */ +int rcu_stall_notifier_call_chain(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v); +} + +#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1971869c4072..dfeba9b35395 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -1,26 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2001 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf @@ -38,6 +25,7 @@ #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> +#include <linux/torture.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/percpu.h> @@ -52,6 +40,10 @@ #include <linux/tick.h> #include <linux/rcupdate_wait.h> #include <linux/sched/isolation.h> +#include <linux/kprobes.h> +#include <linux/slab.h> +#include <linux/irq_work.h> +#include <linux/rcupdate_trace.h> #define CREATE_TRACE_POINTS @@ -63,19 +55,25 @@ #define MODULE_PARAM_PREFIX "rcupdate." #ifndef CONFIG_TINY_RCU -extern int rcu_expedited; /* from sysctl */ -module_param(rcu_expedited, int, 0); -extern int rcu_normal; /* from sysctl */ -module_param(rcu_normal, int, 0); -static int rcu_normal_after_boot; -module_param(rcu_normal_after_boot, int, 0); +module_param(rcu_expedited, int, 0444); +module_param(rcu_normal, int, 0444); +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT); +#if !defined(CONFIG_PREEMPT_RT) || defined(CONFIG_NO_HZ_FULL) +module_param(rcu_normal_after_boot, int, 0444); +#endif #endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC /** - * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? 
+ * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? + * @ret: Best guess answer if lockdep cannot be relied on + * + * Returns true if lockdep must be ignored, in which case ``*ret`` contains + * the best guess described below. Otherwise returns false, in which + * case ``*ret`` tells the caller nothing and the caller should instead + * consult lockdep. * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * If CONFIG_DEBUG_LOCK_ALLOC is selected, set ``*ret`` to nonzero iff in an * RCU-sched read-side critical section. In absence of * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side * critical section unless it can prove otherwise. Note that disabling @@ -87,35 +85,45 @@ module_param(rcu_normal_after_boot, int, 0); * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. * - * Note that if the CPU is in the idle loop from an RCU point of - * view (ie: that we are in the section between rcu_idle_enter() and - * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU - * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs - * that are in such a section, considering these as in extended quiescent - * state, so such a CPU is effectively never in an RCU read-side critical - * section regardless of what RCU primitives it invokes. This state of - * affairs is required --- we need to keep an RCU-free window in idle - * where the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run in - * the idle task. + * Note that if the CPU is in the idle loop from an RCU point of view (ie: + * that we are in the section between ct_idle_enter() and ct_idle_exit()) + * then rcu_read_lock_held() sets ``*ret`` to false even if the CPU did an + * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are + * in such a section, considering these as in extended quiescent state, + * so such a CPU is effectively never in an RCU read-side critical section + * regardless of what RCU primitives it invokes. This state of affairs is + * required --- we need to keep an RCU-free window in idle where the CPU may + * possibly enter into low power mode. This way we can notice an extended + * quiescent state to other CPUs that started a grace period. Otherwise + * we would delay any grace period as long as we run in the idle task. * - * Similarly, we avoid claiming an SRCU read lock held if the current + * Similarly, we avoid claiming an RCU read lock held if the current * CPU is offline. 
*/ -int rcu_read_lock_sched_held(void) +static bool rcu_read_lock_held_common(bool *ret) { - int lockdep_opinion = 0; + if (!debug_lockdep_rcu_enabled()) { + *ret = true; + return true; + } + if (!rcu_is_watching()) { + *ret = false; + return true; + } + if (!rcu_lockdep_current_cpu_online()) { + *ret = false; + return true; + } + return false; +} - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; - if (debug_locks) - lockdep_opinion = lock_is_held(&rcu_sched_lock_map); - return lockdep_opinion || !preemptible(); +int notrace rcu_read_lock_sched_held(void) +{ + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; + return lock_is_held(&rcu_sched_lock_map) || !preemptible(); } EXPORT_SYMBOL(rcu_read_lock_sched_held); #endif @@ -137,8 +145,45 @@ bool rcu_gp_is_normal(void) } EXPORT_SYMBOL_GPL(rcu_gp_is_normal); -static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); +static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1); +/* + * Should call_rcu() callbacks be processed with urgency or are + * they OK being executed with arbitrary delays? + */ +bool rcu_async_should_hurry(void) +{ + return !IS_ENABLED(CONFIG_RCU_LAZY) || + atomic_read(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_should_hurry); + +/** + * rcu_async_hurry - Make future async RCU callbacks not lazy. + * + * After a call to this function, future calls to call_rcu() + * will be processed in a timely fashion. + */ +void rcu_async_hurry(void) +{ + if (IS_ENABLED(CONFIG_RCU_LAZY)) + atomic_inc(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_hurry); + +/** + * rcu_async_relax - Make future async RCU callbacks lazy. + * + * After a call to this function, future calls to call_rcu() + * will be processed in a lazy fashion. + */ +void rcu_async_relax(void) +{ + if (IS_ENABLED(CONFIG_RCU_LAZY)) + atomic_dec(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_relax); +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); /* * Should normal grace-period primitives be expedited? Intended for * use within RCU. Note that this function takes the rcu_expedited @@ -148,8 +193,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); */ bool rcu_gp_is_expedited(void) { - return rcu_expedited || atomic_read(&rcu_expedited_nesting) || - rcu_scheduler_active == RCU_SCHEDULER_INIT; + return rcu_expedited || atomic_read(&rcu_expedited_nesting); } EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); @@ -181,16 +225,29 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); +static bool rcu_boot_ended __read_mostly; + /* * Inform RCU of the end of the in-kernel boot sequence. */ void rcu_end_inkernel_boot(void) { rcu_unexpedite_gp(); + rcu_async_relax(); if (rcu_normal_after_boot) WRITE_ONCE(rcu_normal, 1); + rcu_boot_ended = true; } +/* + * Let rcutorture know when it is OK to turn it up to eleven. + */ +bool rcu_inkernel_boot_has_ended(void) +{ + return rcu_boot_ended; +} +EXPORT_SYMBOL_GPL(rcu_inkernel_boot_has_ended); + #endif /* #ifndef CONFIG_TINY_RCU */ /* @@ -202,11 +259,12 @@ void rcu_test_sync_prims(void) { if (!IS_ENABLED(CONFIG_PROVE_RCU)) return; + pr_info("Running RCU synchronous self tests\n"); synchronize_rcu(); synchronize_rcu_expedited(); } -#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) +#if !defined(CONFIG_TINY_RCU) /* * Switch to run-time mode once RCU has fully initialized. 
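The rcu_async_hurry()/rcu_async_relax() pair added above is meant to bracket regions whose call_rcu() callbacks must not linger when CONFIG_RCU_LAZY is enabled; rcu_end_inkernel_boot() uses rcu_async_relax() this way once boot completes. A sketch of the intended bracketing (illustrative only, not code from this patch; the callback and structure names are hypothetical):

	rcu_async_hurry();		/* Subsequent call_rcu() callbacks are not lazy. */
	call_rcu(&p->rh, reclaim_cb);	/* Hypothetical callback whose latency matters. */
	rcu_barrier();			/* Optionally wait for pending callbacks. */
	rcu_async_relax();		/* Permit lazy batching once again. */
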
@@ -215,37 +273,51 @@ static int __init rcu_set_runtime_mode(void) { rcu_test_sync_prims(); rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + kfree_rcu_scheduler_running(); rcu_test_sync_prims(); return 0; } core_initcall(rcu_set_runtime_mode); -#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ +#endif /* #if !defined(CONFIG_TINY_RCU) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +struct lockdep_map rcu_lock_map = { + .name = "rcu_read_lock", + .key = &rcu_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT implies PREEMPT_RCU */ +}; EXPORT_SYMBOL_GPL(rcu_lock_map); static struct lock_class_key rcu_bh_lock_key; -struct lockdep_map rcu_bh_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); +struct lockdep_map rcu_bh_lock_map = { + .name = "rcu_read_lock_bh", + .key = &rcu_bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */ +}; EXPORT_SYMBOL_GPL(rcu_bh_lock_map); static struct lock_class_key rcu_sched_lock_key; -struct lockdep_map rcu_sched_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); +struct lockdep_map rcu_sched_lock_map = { + .name = "rcu_read_lock_sched", + .key = &rcu_sched_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_SPIN, +}; EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +// Tell lockdep when RCU callbacks are being invoked. static struct lock_class_key rcu_callback_key; struct lockdep_map rcu_callback_map = STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); EXPORT_SYMBOL_GPL(rcu_callback_map); -int notrace debug_lockdep_rcu_enabled(void) +noinstr int notrace debug_lockdep_rcu_enabled(void) { - return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks && + return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && READ_ONCE(debug_locks) && current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); @@ -270,14 +342,12 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); * Note that rcu_read_lock() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. */ -int rcu_read_lock_held(void) +int notrace rcu_read_lock_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return lock_is_held(&rcu_lock_map); } EXPORT_SYMBOL_GPL(rcu_read_lock_held); @@ -297,18 +367,30 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); * Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or * offline from an RCU perspective, so check for those as well. 
*/ -int rcu_read_lock_bh_held(void) +int notrace rcu_read_lock_bh_held(void) { - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); +int notrace rcu_read_lock_any_held(void) +{ + bool ret; + + if (rcu_read_lock_held_common(&ret)) + return ret; + if (lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_sched_lock_map)) + return 1; + return !preemptible(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_any_held); + #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /** @@ -326,7 +408,7 @@ void wakeme_after_rcu(struct rcu_head *head) } EXPORT_SYMBOL_GPL(wakeme_after_rcu); -void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, +void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array) { int i; @@ -339,13 +421,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, might_sleep(); continue; } - init_rcu_head_on_stack(&rs_array[i].head); - init_completion(&rs_array[i].completion); for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) + if (j == i) { + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } } /* Wait for all callbacks to be invoked. */ @@ -356,13 +439,21 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, for (j = 0; j < i; j++) if (crcu_array[j] == crcu_array[i]) break; - if (j == i) - wait_for_completion(&rs_array[i].completion); - destroy_rcu_head_on_stack(&rs_array[i].head); + if (j == i) { + wait_for_completion_state(&rs_array[i].completion, state); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } } EXPORT_SYMBOL_GPL(__wait_rcu_gp); +void finish_rcuwait(struct rcuwait *w) +{ + rcu_assign_pointer(w->task, NULL); + __set_current_state(TASK_RUNNING); +} +EXPORT_SYMBOL_GPL(finish_rcuwait); + #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head) { @@ -414,14 +505,14 @@ void destroy_rcu_head_on_stack(struct rcu_head *head) } EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); -struct debug_obj_descr rcuhead_debug_descr = { +const struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", .is_static_object = rcuhead_is_static_object, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_RCU_TRACE) void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) @@ -434,434 +525,59 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif -#ifdef CONFIG_RCU_STALL_COMMON +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) +/* Get rcutorture access to sched_setaffinity(). 
*/ +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn) +{ + int ret; -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 + ret = sched_setaffinity(pid, in_mask); + WARN_ONCE(dowarn && ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); + return ret; +} +EXPORT_SYMBOL_GPL(torture_sched_setaffinity); #endif -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); -static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; +int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful) +EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers); +#ifdef CONFIG_RCU_STALL_COMMON +int rcu_cpu_stall_ftrace_dump __read_mostly; +module_param(rcu_cpu_stall_ftrace_dump, int, 0644); +#ifdef CONFIG_RCU_CPU_STALL_NOTIFIER +module_param(rcu_cpu_stall_notifiers, int, 0444); +#endif // #ifdef CONFIG_RCU_CPU_STALL_NOTIFIER +int rcu_cpu_stall_suppress __read_mostly; // !0 = suppress stall warnings. +EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); module_param(rcu_cpu_stall_suppress, int, 0644); +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); - -int rcu_jiffies_till_stall_check(void) -{ - int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); - - /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. - */ - if (till_stall_check < 3) { - WRITE_ONCE(rcu_cpu_stall_timeout, 3); - till_stall_check = 3; - } else if (till_stall_check > 300) { - WRITE_ONCE(rcu_cpu_stall_timeout, 300); - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} -EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); - -void rcu_sysrq_start(void) -{ - if (!rcu_cpu_stall_suppress) - rcu_cpu_stall_suppress = 2; -} - -void rcu_sysrq_end(void) -{ - if (rcu_cpu_stall_suppress == 2) - rcu_cpu_stall_suppress = 0; -} - -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static int __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); - return 0; -} -early_initcall(check_cpu_stall_init); - +int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT; +module_param(rcu_exp_cpu_stall_timeout, int, 0644); +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME); +module_param(rcu_cpu_stall_cputime, int, 0644); +bool rcu_exp_stall_task_details __read_mostly; +module_param(rcu_exp_stall_task_details, bool, 0644); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ -#ifdef CONFIG_TASKS_RCU - -/* - * Simple variant of RCU whose quiescent states are voluntary context - * switch, cond_resched_rcu_qs(), user-space execution, and idle. - * As such, grace periods can take one good long time. There are no - * read-side primitives similar to rcu_read_lock() and rcu_read_unlock() - * because this implementation is intended to get the system into a safe - * state for some of the manipulations involved in tracing and the like. - * Finally, this implementation does not support high call_rcu_tasks() - * rates from multiple CPUs. If this is required, per-CPU callback lists - * will be needed. - */ - -/* Global list of callbacks and associated lock. 
*/ -static struct rcu_head *rcu_tasks_cbs_head; -static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; -static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); -static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); - -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); - -/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) -static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; -module_param(rcu_task_stall_timeout, int, 0644); - -static struct task_struct *rcu_tasks_kthread_ptr; - -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @rhp: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle, - * or transition to usermode execution. As such, there are no read-side - * primitives analogous to rcu_read_lock() and rcu_read_unlock() because - * this primitive is intended to determine that all tasks have passed - * through a safe state, not so much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) -{ - unsigned long flags; - bool needwake; - - rhp->next = NULL; - rhp->func = func; - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - needwake = !rcu_tasks_cbs_head; - *rcu_tasks_cbs_tail = rhp; - rcu_tasks_cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - /* We can't create the thread unless interrupts are enabled. */ - if (needwake && READ_ONCE(rcu_tasks_kthread_ptr)) - wake_up(&rcu_tasks_cbs_wq); -} -EXPORT_SYMBOL_GPL(call_rcu_tasks); +// Suppress boot-time RCU CPU stall warnings and rcutorture writer stall +// warnings. Also used by rcutorture even if stall warnings are excluded. +int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. +EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); +module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); /** - * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-tasks - * grace period has elapsed, in other words after all currently - * executing rcu-tasks read-side critical sections have elapsed. These - * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls - * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). - * - * This is a very specialized primitive, intended only for a few uses in - * tracing and other situations requiring manipulation of function - * preambles and profiling hooks. The synchronize_rcu_tasks() function - * is not (yet) intended for heavy use from multiple CPUs. + * get_completed_synchronize_rcu - Return a pre-completed polled state cookie * - * Note that this guarantee implies further memory-ordering guarantees. 
- * On systems with more than one CPU, when synchronize_rcu_tasks() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-tasks read-side critical section whose beginning - * preceded the call to synchronize_rcu_tasks(). In addition, each CPU - * having an RCU-tasks read-side critical section that extends beyond - * the return from synchronize_rcu_tasks() is guaranteed to have executed - * a full memory barrier after the beginning of synchronize_rcu_tasks() - * and before the beginning of that RCU-tasks read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU - * (but again only if the system has more than one CPU). - */ -void synchronize_rcu_tasks(void) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(call_rcu_tasks); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); - -/** - * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks(void) -{ - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks); - -/* See if tasks are still holding out, complain if so. */ -static void check_holdout_task(struct task_struct *t, - bool needreport, bool *firstreport) -{ - int cpu; - - if (!READ_ONCE(t->rcu_tasks_holdout) || - t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) || - !READ_ONCE(t->on_rq) || - (IS_ENABLED(CONFIG_NO_HZ_FULL) && - !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { - WRITE_ONCE(t->rcu_tasks_holdout, false); - list_del_init(&t->rcu_tasks_holdout_list); - put_task_struct(t); - return; - } - rcu_request_urgent_qs_task(t); - if (!needreport) - return; - if (*firstreport) { - pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); - *firstreport = false; - } - cpu = task_cpu(t); - pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", - t, ".I"[is_idle_task(t)], - "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], - t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, - t->rcu_tasks_idle_cpu, cpu); - sched_show_task(t); -} - -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) -{ - unsigned long flags; - struct task_struct *g, *t; - unsigned long lastreport; - struct rcu_head *list; - struct rcu_head *next; - LIST_HEAD(rcu_tasks_holdouts); - int fract; - - /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_FLAG_RCU); - - /* - * Each pass through the following loop makes one check for - * newly arrived callbacks, and, if there are some, waits for - * one RCU-tasks grace period and then invokes the callbacks. - * This loop is terminated by the system going down. ;-) - */ - for (;;) { - - /* Pick up any new callbacks. 
*/ - raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); - list = rcu_tasks_cbs_head; - rcu_tasks_cbs_head = NULL; - rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; - raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - - /* If there were none, wait a bit and start over. */ - if (!list) { - wait_event_interruptible(rcu_tasks_cbs_wq, - rcu_tasks_cbs_head); - if (!rcu_tasks_cbs_head) { - WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(HZ/10); - } - continue; - } - - /* - * Wait for all pre-existing t->on_rq and t->nvcsw - * transitions to complete. Invoking synchronize_rcu() - * suffices because all these transitions occur with - * interrupts disabled. Without this synchronize_rcu(), - * a read-side critical section that started before the - * grace period might be incorrectly seen as having started - * after the grace period. - * - * This synchronize_rcu() also dispenses with the - * need for a memory barrier on the first store to - * ->rcu_tasks_holdout, as it forces the store to happen - * after the beginning of the grace period. - */ - synchronize_rcu(); - - /* - * There were callbacks, so we need to wait for an - * RCU-tasks grace period. Start off by scanning - * the task list for tasks that are not already - * voluntarily blocked. Mark these tasks and make - * a list of them in rcu_tasks_holdouts. - */ - rcu_read_lock(); - for_each_process_thread(g, t) { - if (t != current && READ_ONCE(t->on_rq) && - !is_idle_task(t)) { - get_task_struct(t); - t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw); - WRITE_ONCE(t->rcu_tasks_holdout, true); - list_add(&t->rcu_tasks_holdout_list, - &rcu_tasks_holdouts); - } - } - rcu_read_unlock(); - - /* - * Wait for tasks that are in the process of exiting. - * This does only part of the job, ensuring that all - * tasks that were previously exiting reach the point - * where they have disabled preemption, allowing the - * later synchronize_rcu() to finish the job. - */ - synchronize_srcu(&tasks_rcu_exit_srcu); - - /* - * Each pass through the following loop scans the list - * of holdout tasks, removing any that are no longer - * holdouts. When the list is empty, we are done. - */ - lastreport = jiffies; - - /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ - fract = 10; - - for (;;) { - bool firstreport; - bool needreport; - int rtst; - struct task_struct *t1; - - if (list_empty(&rcu_tasks_holdouts)) - break; - - /* Slowly back off waiting for holdouts */ - schedule_timeout_interruptible(HZ/fract); - - if (fract > 1) - fract--; - - rtst = READ_ONCE(rcu_task_stall_timeout); - needreport = rtst > 0 && - time_after(jiffies, lastreport + rtst); - if (needreport) - lastreport = jiffies; - firstreport = true; - WARN_ON(signal_pending(current)); - list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) { - check_holdout_task(t, needreport, &firstreport); - cond_resched(); - } - } - - /* - * Because ->on_rq and ->nvcsw are not guaranteed - * to have a full memory barriers prior to them in the - * schedule() path, memory reordering on other CPUs could - * cause their RCU-tasks read-side critical sections to - * extend past the end of the grace period. However, - * because these ->nvcsw updates are carried out with - * interrupts disabled, we can use synchronize_rcu() - * to force the needed ordering on all such CPUs. - * - * This synchronize_rcu() also confines all - * ->rcu_tasks_holdout accesses to be within the grace - * period, avoiding the need for memory barriers for - * ->rcu_tasks_holdout accesses. 
- * - * In addition, this synchronize_rcu() waits for exiting - * tasks to complete their final preempt_disable() region - * of execution, cleaning up after the synchronize_srcu() - * above. - */ - synchronize_rcu(); - - /* Invoke the callbacks. */ - while (list) { - next = list->next; - local_bh_disable(); - list->func(list); - local_bh_enable(); - list = next; - cond_resched(); - } - /* Paranoid sleep to keep this from entering a tight loop */ - schedule_timeout_uninterruptible(HZ/10); - } -} - -/* Spawn rcu_tasks_kthread() at core_initcall() time. */ -static int __init rcu_spawn_tasks_kthread(void) -{ - struct task_struct *t; - - t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); - if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__)) - return 0; - smp_mb(); /* Ensure others see full kthread. */ - WRITE_ONCE(rcu_tasks_kthread_ptr, t); - return 0; -} -core_initcall(rcu_spawn_tasks_kthread); - -/* Do the srcu_read_lock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_start(void) -{ - preempt_disable(); - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); - preempt_enable(); -} - -/* Do the srcu_read_unlock() for the above synchronize_srcu(). */ -void exit_tasks_rcu_finish(void) -{ - preempt_disable(); - __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); - preempt_enable(); -} - -#endif /* #ifdef CONFIG_TASKS_RCU */ - -#ifndef CONFIG_TINY_RCU - -/* - * Print any non-default Tasks RCU settings. + * Returns a value that will always be treated by functions like + * poll_state_synchronize_rcu() as a cookie whose grace period has already + * completed. */ -static void __init rcu_tasks_bootup_oddness(void) +unsigned long get_completed_synchronize_rcu(void) { -#ifdef CONFIG_TASKS_RCU - if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) - pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); - else - pr_info("\tTasks RCU enabled.\n"); -#endif /* #ifdef CONFIG_TASKS_RCU */ + return RCU_GET_STATE_COMPLETED; } - -#endif /* #ifndef CONFIG_TINY_RCU */ +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu); #ifdef CONFIG_PROVE_RCU @@ -880,15 +596,27 @@ static void test_callback(struct rcu_head *r) } DEFINE_STATIC_SRCU(early_srcu); +static unsigned long early_srcu_cookie; + +struct early_boot_kfree_rcu { + struct rcu_head rh; +}; static void early_boot_test_call_rcu(void) { static struct rcu_head head; + int idx; static struct rcu_head shead; + struct early_boot_kfree_rcu *rhp; + idx = srcu_down_read(&early_srcu); + srcu_up_read(&early_srcu, idx); call_rcu(&head, test_callback); - if (IS_ENABLED(CONFIG_SRCU)) - call_srcu(&early_srcu, &shead, test_callback); + early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu); + call_srcu(&early_srcu, &shead, test_callback); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (!WARN_ON_ONCE(!rhp)) + kfree_rcu(rhp, rh); } void rcu_early_boot_tests(void) @@ -908,10 +636,10 @@ static int rcu_verify_early_boot_tests(void) if (rcu_self_test) { early_boot_test_counter++; rcu_barrier(); - if (IS_ENABLED(CONFIG_SRCU)) { - early_boot_test_counter++; - srcu_barrier(&early_srcu); - } + early_boot_test_counter++; + srcu_barrier(&early_srcu); + WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie)); + cleanup_srcu_struct(&early_srcu); } if (rcu_self_test_counter != early_boot_test_counter) { WARN_ON(1); @@ -925,6 +653,8 @@ late_initcall(rcu_verify_early_boot_tests); void 
rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ +#include "tasks.h" + #ifndef CONFIG_TINY_RCU /* |
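The early-boot self-test above exercises the polled grace-period interfaces, and get_completed_synchronize_rcu(), added to update.c here, returns a cookie that poll_state_synchronize_rcu() always treats as already expired. A sketch of the polling pattern these interfaces support (illustrative only, not code from this patch):

	unsigned long cookie;

	cookie = get_completed_synchronize_rcu();
	WARN_ON_ONCE(!poll_state_synchronize_rcu(cookie));	/* Always completed. */

	cookie = start_poll_synchronize_rcu();	/* Start a real grace period... */
	while (!poll_state_synchronize_rcu(cookie))
		schedule_timeout_uninterruptible(1);	/* ...and poll until it ends. */
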
