Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.kexec | 35
-rw-r--r--  kernel/Makefile | 14
-rw-r--r--  kernel/audit.c | 6
-rw-r--r--  kernel/audit.h | 2
-rw-r--r--  kernel/audit_tree.c | 63
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/bpf/Makefile | 5
-rw-r--r--  kernel/bpf/arena.c | 43
-rw-r--r--  kernel/bpf/arraymap.c | 11
-rw-r--r--  kernel/bpf/bpf_iter.c | 14
-rw-r--r--  kernel/bpf/bpf_local_storage.c | 8
-rw-r--r--  kernel/bpf/bpf_lru_list.c | 9
-rw-r--r--  kernel/bpf/bpf_lru_list.h | 1
-rw-r--r--  kernel/bpf/bpf_struct_ops.c | 7
-rw-r--r--  kernel/bpf/btf.c | 168
-rw-r--r--  kernel/bpf/cgroup.c | 275
-rw-r--r--  kernel/bpf/core.c | 182
-rw-r--r--  kernel/bpf/cpumap.c | 3
-rw-r--r--  kernel/bpf/dmabuf_iter.c | 150
-rw-r--r--  kernel/bpf/hashtab.c | 148
-rw-r--r--  kernel/bpf/helpers.c | 614
-rw-r--r--  kernel/bpf/link_iter.c | 3
-rw-r--r--  kernel/bpf/local_storage.c | 9
-rw-r--r--  kernel/bpf/net_namespace.c | 10
-rw-r--r--  kernel/bpf/preload/Kconfig | 5
-rw-r--r--  kernel/bpf/preload/iterators/iterators.lskel-big-endian.h | 492
-rw-r--r--  kernel/bpf/prog_iter.c | 3
-rw-r--r--  kernel/bpf/rqspinlock.c | 23
-rw-r--r--  kernel/bpf/stream.c | 526
-rw-r--r--  kernel/bpf/syscall.c | 301
-rw-r--r--  kernel/bpf/sysfs_btf.c | 34
-rw-r--r--  kernel/bpf/tcx.c | 16
-rw-r--r--  kernel/bpf/tnum.c | 5
-rw-r--r--  kernel/bpf/token.c | 25
-rw-r--r--  kernel/bpf/trampoline.c | 51
-rw-r--r--  kernel/bpf/verifier.c | 2961
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 6
-rw-r--r--  kernel/cgroup/cgroup.c | 153
-rw-r--r--  kernel/cgroup/cpuset.c | 130
-rw-r--r--  kernel/cgroup/legacy_freezer.c | 11
-rw-r--r--  kernel/cgroup/misc.c | 4
-rw-r--r--  kernel/cgroup/rstat.c | 467
-rw-r--r--  kernel/configs/debug.config | 5
-rw-r--r--  kernel/configs/hardening.config | 6
-rw-r--r--  kernel/configs/tiny.config | 1
-rw-r--r--  kernel/configs/xen.config | 3
-rw-r--r--  kernel/cpu.c | 130
-rw-r--r--  kernel/crash_dump_dm_crypt.c | 464
-rw-r--r--  kernel/crash_reserve.c | 2
-rw-r--r--  kernel/delayacct.c | 51
-rw-r--r--  kernel/dma/contiguous.c | 5
-rw-r--r--  kernel/dma/direct.c | 44
-rw-r--r--  kernel/dma/mapping.c | 18
-rw-r--r--  kernel/entry/Makefile | 3
-rw-r--r--  kernel/entry/common.c | 163
-rw-r--r--  kernel/entry/syscall-common.c | 112
-rw-r--r--  kernel/entry/syscall_user_dispatch.c | 36
-rw-r--r--  kernel/events/core.c | 142
-rw-r--r--  kernel/events/ring_buffer.c | 4
-rw-r--r--  kernel/events/uprobes.c | 357
-rw-r--r--  kernel/exit.c | 85
-rw-r--r--  kernel/fork.c | 327
-rw-r--r--  kernel/freezer.c | 15
-rw-r--r--  kernel/futex/core.c | 295
-rw-r--r--  kernel/futex/futex.h | 2
-rw-r--r--  kernel/gcov/gcc_4_7.c | 4
-rw-r--r--  kernel/hung_task.c | 55
-rw-r--r--  kernel/irq/Kconfig | 11
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/affinity.c | 11
-rw-r--r--  kernel/irq/autoprobe.c | 26
-rw-r--r--  kernel/irq/chip.c | 705
-rw-r--r--  kernel/irq/cpuhotplug.c | 19
-rw-r--r--  kernel/irq/debugfs.c | 7
-rw-r--r--  kernel/irq/generic-chip.c | 47
-rw-r--r--  kernel/irq/internals.h | 54
-rw-r--r--  kernel/irq/irq_sim.c | 2
-rw-r--r--  kernel/irq/irq_test.c | 229
-rw-r--r--  kernel/irq/irqdesc.c | 176
-rw-r--r--  kernel/irq/irqdomain.c | 132
-rw-r--r--  kernel/irq/manage.c | 1166
-rw-r--r--  kernel/irq/msi.c | 195
-rw-r--r--  kernel/irq/pm.c | 54
-rw-r--r--  kernel/irq/proc.c | 67
-rw-r--r--  kernel/irq/resend.c | 50
-rw-r--r--  kernel/irq/spurious.c | 137
-rw-r--r--  kernel/kallsyms.c | 3
-rw-r--r--  kernel/kcsan/kcsan_test.c | 4
-rw-r--r--  kernel/kexec_core.c | 57
-rw-r--r--  kernel/kexec_file.c | 127
-rw-r--r--  kernel/kexec_handover.c | 1271
-rw-r--r--  kernel/kexec_internal.h | 16
-rw-r--r--  kernel/kprobes.c | 8
-rw-r--r--  kernel/kstack_erase.c (renamed from kernel/stackleak.c) | 22
-rw-r--r--  kernel/kthread.c | 3
-rw-r--r--  kernel/locking/lockdep.c | 39
-rw-r--r--  kernel/locking/lockdep_internals.h | 18
-rw-r--r--  kernel/locking/lockdep_proc.c | 2
-rw-r--r--  kernel/locking/mutex-debug.c | 9
-rw-r--r--  kernel/locking/mutex.c | 48
-rw-r--r--  kernel/locking/mutex.h | 3
-rw-r--r--  kernel/locking/rtmutex_api.c | 51
-rw-r--r--  kernel/locking/rwsem.c | 4
-rw-r--r--  kernel/locking/semaphore.c | 57
-rw-r--r--  kernel/locking/ww_mutex.h | 16
-rw-r--r--  kernel/module/internal.h | 10
-rw-r--r--  kernel/module/main.c | 176
-rw-r--r--  kernel/module/strict_rwx.c | 47
-rw-r--r--  kernel/module/sysfs.c | 14
-rw-r--r--  kernel/panic.c | 114
-rw-r--r--  kernel/pid.c | 33
-rw-r--r--  kernel/power/console.c | 7
-rw-r--r--  kernel/power/energy_model.c | 72
-rw-r--r--  kernel/power/hibernate.c | 52
-rw-r--r--  kernel/power/main.c | 17
-rw-r--r--  kernel/power/power.h | 9
-rw-r--r--  kernel/power/process.c | 8
-rw-r--r--  kernel/power/snapshot.c | 54
-rw-r--r--  kernel/power/suspend.c | 6
-rw-r--r--  kernel/power/wakelock.c | 3
-rw-r--r--  kernel/printk/internal.h | 1
-rw-r--r--  kernel/ptrace.c | 179
-rw-r--r--  kernel/rcu/rcutorture.c | 358
-rw-r--r--  kernel/rcu/refscale.c | 42
-rw-r--r--  kernel/rcu/srcutree.c | 2
-rw-r--r--  kernel/rcu/tasks.h | 3
-rw-r--r--  kernel/rcu/tree.c | 84
-rw-r--r--  kernel/rcu/tree.h | 13
-rw-r--r--  kernel/rcu/tree_exp.h | 59
-rw-r--r--  kernel/rcu/tree_nocb.h | 12
-rw-r--r--  kernel/rcu/tree_plugin.h | 122
-rw-r--r--  kernel/rcu/tree_stall.h | 62
-rw-r--r--  kernel/relay.c | 111
-rw-r--r--  kernel/resource.c | 5
-rw-r--r--  kernel/sched/autogroup.c | 9
-rw-r--r--  kernel/sched/autogroup.h | 6
-rw-r--r--  kernel/sched/build_policy.c | 6
-rw-r--r--  kernel/sched/build_utility.c | 9
-rw-r--r--  kernel/sched/clock.c | 7
-rw-r--r--  kernel/sched/completion.c | 5
-rw-r--r--  kernel/sched/core.c | 897
-rw-r--r--  kernel/sched/core_sched.c | 2
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/cpudeadline.c | 1
-rw-r--r--  kernel/sched/cpudeadline.h | 4
-rw-r--r--  kernel/sched/cpufreq.c | 1
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 15
-rw-r--r--  kernel/sched/cpupri.c | 1
-rw-r--r--  kernel/sched/cpupri.h | 5
-rw-r--r--  kernel/sched/cputime.c | 17
-rw-r--r--  kernel/sched/deadline.c | 218
-rw-r--r--  kernel/sched/debug.c | 47
-rw-r--r--  kernel/sched/ext.c | 1822
-rw-r--r--  kernel/sched/ext.h | 15
-rw-r--r--  kernel/sched/ext_idle.c | 314
-rw-r--r--  kernel/sched/ext_idle.h | 3
-rw-r--r--  kernel/sched/fair.c | 420
-rw-r--r--  kernel/sched/idle.c | 15
-rw-r--r--  kernel/sched/isolation.c | 2
-rw-r--r--  kernel/sched/loadavg.c | 8
-rw-r--r--  kernel/sched/membarrier.c | 2
-rw-r--r--  kernel/sched/pelt.c | 5
-rw-r--r--  kernel/sched/pelt.h | 67
-rw-r--r--  kernel/sched/psi.c | 131
-rw-r--r--  kernel/sched/rt.c | 112
-rw-r--r--  kernel/sched/sched-pelt.h | 1
-rw-r--r--  kernel/sched/sched.h | 251
-rw-r--r--  kernel/sched/smp.h | 7
-rw-r--r--  kernel/sched/stats.c | 5
-rw-r--r--  kernel/sched/stats.h | 10
-rw-r--r--  kernel/sched/stop_task.c | 5
-rw-r--r--  kernel/sched/swait.c | 1
-rw-r--r--  kernel/sched/syscalls.c | 15
-rw-r--r--  kernel/sched/topology.c | 82
-rw-r--r--  kernel/sched/wait.c | 23
-rw-r--r--  kernel/sched/wait_bit.c | 3
-rw-r--r--  kernel/signal.c | 13
-rw-r--r--  kernel/smp.c | 70
-rw-r--r--  kernel/smpboot.c | 4
-rw-r--r--  kernel/stop_machine.c | 20
-rw-r--r--  kernel/sys.c | 29
-rw-r--r--  kernel/sysctl-test.c | 49
-rw-r--r--  kernel/sysctl.c | 378
-rw-r--r--  kernel/time/Kconfig | 15
-rw-r--r--  kernel/time/alarmtimer.c | 84
-rw-r--r--  kernel/time/clocksource.c | 10
-rw-r--r--  kernel/time/jiffies.c | 5
-rw-r--r--  kernel/time/namespace.c | 5
-rw-r--r--  kernel/time/ntp.c | 72
-rw-r--r--  kernel/time/ntp_internal.h | 13
-rw-r--r--  kernel/time/posix-cpu-timers.c | 9
-rw-r--r--  kernel/time/posix-timers.c | 26
-rw-r--r--  kernel/time/posix-timers.h | 1
-rw-r--r--  kernel/time/sleep_timeout.c | 4
-rw-r--r--  kernel/time/timecounter.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 631
-rw-r--r--  kernel/time/timekeeping_internal.h | 3
-rw-r--r--  kernel/time/timer.c | 78
-rw-r--r--  kernel/time/timer_migration.c | 23
-rw-r--r--  kernel/time/vsyscall.c | 70
-rw-r--r--  kernel/trace/Kconfig | 26
-rw-r--r--  kernel/trace/blktrace.c | 25
-rw-r--r--  kernel/trace/bpf_trace.c | 411
-rw-r--r--  kernel/trace/fgraph.c | 18
-rw-r--r--  kernel/trace/fprobe.c | 9
-rw-r--r--  kernel/trace/ftrace.c | 64
-rw-r--r--  kernel/trace/pid_list.c | 8
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 403
-rw-r--r--  kernel/trace/rv/Kconfig | 43
-rw-r--r--  kernel/trace/rv/Makefile | 9
-rw-r--r--  kernel/trace/rv/monitors/nrp/Kconfig (renamed from kernel/trace/rv/monitors/tss/Kconfig) | 12
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp.c | 138
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp.h | 75
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/opid/Kconfig | 19
-rw-r--r--  kernel/trace/rv/monitors/opid/opid.c | 168
-rw-r--r--  kernel/trace/rv/monitors/opid/opid.h | 104
-rw-r--r--  kernel/trace/rv/monitors/opid/opid_trace.h (renamed from kernel/trace/rv/monitors/sncid/sncid_trace.h) | 8
-rw-r--r--  kernel/trace/rv/monitors/pagefault/Kconfig | 20
-rw-r--r--  kernel/trace/rv/monitors/pagefault/pagefault.c | 88
-rw-r--r--  kernel/trace/rv/monitors/pagefault/pagefault.h | 64
-rw-r--r--  kernel/trace/rv/monitors/pagefault/pagefault_trace.h | 14
-rw-r--r--  kernel/trace/rv/monitors/rtapp/Kconfig | 11
-rw-r--r--  kernel/trace/rv/monitors/rtapp/rtapp.c | 33
-rw-r--r--  kernel/trace/rv/monitors/rtapp/rtapp.h | 3
-rw-r--r--  kernel/trace/rv/monitors/sched/Kconfig | 1
-rw-r--r--  kernel/trace/rv/monitors/sched/sched.c | 3
-rw-r--r--  kernel/trace/rv/monitors/sco/sco.c | 7
-rw-r--r--  kernel/trace/rv/monitors/scpd/Kconfig | 2
-rw-r--r--  kernel/trace/rv/monitors/scpd/scpd.c | 7
-rw-r--r--  kernel/trace/rv/monitors/sleep/Kconfig | 22
-rw-r--r--  kernel/trace/rv/monitors/sleep/sleep.c | 237
-rw-r--r--  kernel/trace/rv/monitors/sleep/sleep.h | 257
-rw-r--r--  kernel/trace/rv/monitors/sleep/sleep_trace.h | 14
-rw-r--r--  kernel/trace/rv/monitors/sncid/sncid.c | 96
-rw-r--r--  kernel/trace/rv/monitors/sncid/sncid.h | 49
-rw-r--r--  kernel/trace/rv/monitors/snep/Kconfig | 2
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.c | 7
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.h | 14
-rw-r--r--  kernel/trace/rv/monitors/snroc/snroc.c | 3
-rw-r--r--  kernel/trace/rv/monitors/sssw/Kconfig (renamed from kernel/trace/rv/monitors/sncid/Kconfig) | 10
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw.c | 116
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw.h | 105
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/sts/Kconfig | 19
-rw-r--r--  kernel/trace/rv/monitors/sts/sts.c | 156
-rw-r--r--  kernel/trace/rv/monitors/sts/sts.h | 117
-rw-r--r--  kernel/trace/rv/monitors/sts/sts_trace.h (renamed from kernel/trace/rv/monitors/tss/tss_trace.h) | 8
-rw-r--r--  kernel/trace/rv/monitors/tss/tss.c | 91
-rw-r--r--  kernel/trace/rv/monitors/tss/tss.h | 47
-rw-r--r--  kernel/trace/rv/monitors/wip/Kconfig | 2
-rw-r--r--  kernel/trace/rv/monitors/wip/wip.c | 3
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr.c | 3
-rw-r--r--  kernel/trace/rv/reactor_panic.c | 8
-rw-r--r--  kernel/trace/rv/reactor_printk.c | 8
-rw-r--r--  kernel/trace/rv/rv.c | 220
-rw-r--r--  kernel/trace/rv/rv.h | 39
-rw-r--r--  kernel/trace/rv/rv_reactors.c | 138
-rw-r--r--  kernel/trace/rv/rv_trace.h | 166
-rw-r--r--  kernel/trace/trace.c | 309
-rw-r--r--  kernel/trace/trace.h | 30
-rw-r--r--  kernel/trace/trace_branch.c | 4
-rw-r--r--  kernel/trace/trace_entries.h | 12
-rw-r--r--  kernel/trace/trace_eprobe.c | 53
-rw-r--r--  kernel/trace/trace_events.c | 44
-rw-r--r--  kernel/trace/trace_events_filter.c | 194
-rw-r--r--  kernel/trace/trace_events_hist.c | 179
-rw-r--r--  kernel/trace/trace_events_trigger.c | 64
-rw-r--r--  kernel/trace/trace_fprobe.c | 614
-rw-r--r--  kernel/trace/trace_functions.c | 24
-rw-r--r--  kernel/trace/trace_functions_graph.c | 46
-rw-r--r--  kernel/trace/trace_irqsoff.c | 47
-rw-r--r--  kernel/trace/trace_kdb.c | 17
-rw-r--r--  kernel/trace/trace_kprobe.c | 57
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 12
-rw-r--r--  kernel/trace/trace_osnoise.c | 11
-rw-r--r--  kernel/trace/trace_output.c | 60
-rw-r--r--  kernel/trace/trace_probe.c | 152
-rw-r--r--  kernel/trace/trace_probe.h | 26
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 18
-rw-r--r--  kernel/trace/trace_stack.c | 24
-rw-r--r--  kernel/trace/trace_uprobe.c | 55
-rw-r--r--  kernel/usermode_driver.c | 191
-rw-r--r--  kernel/vmcore_info.c | 4
-rw-r--r--  kernel/watchdog.c | 94
-rw-r--r--  kernel/watchdog_perf.c | 22
-rw-r--r--  kernel/workqueue.c | 26
288 files changed, 18221 insertions, 9980 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 4d111f871951..2ee603a98813 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -38,8 +38,7 @@ config KEXEC
config KEXEC_FILE
bool "Enable kexec file based system call"
depends on ARCH_SUPPORTS_KEXEC_FILE
- select CRYPTO
- select CRYPTO_SHA256
+ select CRYPTO_LIB_SHA256
select KEXEC_CORE
help
This is new version of kexec system call. This system call is
@@ -95,6 +94,20 @@ config KEXEC_JUMP
Jump between original kernel and kexeced kernel and invoke
code in physical address mode via KEXEC
+config KEXEC_HANDOVER
+ bool "kexec handover"
+ depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
+ select MEMBLOCK_KHO_SCRATCH
+ select KEXEC_FILE
+ select DEBUG_FS
+ select LIBFDT
+ select CMA
+ help
+ Allow kexec to hand over state across kernels by generating and
+ passing additional metadata to the target kernel. This is useful
+ to keep data or state alive across the kexec. For this to work,
+ both source and target kernels need to have this option enabled.
+
config CRASH_DUMP
bool "kernel crash dumps"
default ARCH_DEFAULT_CRASH_DUMP
@@ -116,6 +129,24 @@ config CRASH_DUMP
For s390, this option also enables zfcpdump.
See also <file:Documentation/arch/s390/zfcpdump.rst>
+config CRASH_DM_CRYPT
+ bool "Support saving crash dump to dm-crypt encrypted volume"
+ depends on KEXEC_FILE
+ depends on CRASH_DUMP
+ depends on DM_CRYPT
+ depends on KEYS
+ help
+ With this option enabled, user space can interact with
+ /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys
+ persistent for the dump-capture kernel.
+
+config CRASH_DM_CRYPT_CONFIGS
+ def_tristate CRASH_DM_CRYPT
+ select CONFIGFS_FS
+ help
+ CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that
+ is required to be built-in.
+
config CRASH_HOTPLUG
bool "Update the crash elfcorehdr on system configuration changes"
default y
diff --git a/kernel/Makefile b/kernel/Makefile
index 434929de17ef..a89602a2a0ab 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,7 +12,6 @@ obj-y = fork.o exec_domain.o panic.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
-obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
obj-$(CONFIG_MULTIUSER) += groups.o
obj-$(CONFIG_VHOST_TASK) += vhost_task.o
@@ -77,9 +76,11 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o elfcorehdr.o
obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_CRASH_DUMP) += crash_core.o
+obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
+obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/
@@ -137,11 +138,12 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
-CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
-KASAN_SANITIZE_stackleak.o := n
-KCSAN_SANITIZE_stackleak.o := n
-KCOV_INSTRUMENT_stackleak.o := n
+CFLAGS_kstack_erase.o += $(DISABLE_KSTACK_ERASE)
+CFLAGS_kstack_erase.o += $(call cc-option,-mgeneral-regs-only)
+obj-$(CONFIG_KSTACK_ERASE) += kstack_erase.o
+KASAN_SANITIZE_kstack_erase.o := n
+KCSAN_SANITIZE_kstack_erase.o := n
+KCOV_INSTRUMENT_kstack_erase.o := n
obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 5f5bf85bcc90..61b5744d0bb6 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1956,8 +1956,8 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
* will be called a second time. Currently, we assume that a printk
* can't format message larger than 1024 bytes, so we don't either.
*/
-static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
- va_list args)
+static __printf(2, 0)
+void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args)
{
int len, avail;
struct sk_buff *skb;
@@ -2285,7 +2285,7 @@ void audit_log_path_denied(int type, const char *operation)
{
struct audit_buffer *ab;
- if (!audit_enabled || audit_dummy_context())
+ if (!audit_enabled)
return;
/* Generate log with subject, operation, outcome. */
diff --git a/kernel/audit.h b/kernel/audit.h
index 0211cb307d30..2a24d01c5fb0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -200,7 +200,7 @@ struct audit_context {
int argc;
} execve;
struct {
- char *name;
+ const char *name;
} module;
struct {
struct audit_ntp_data ntp_data;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f2f38903b2fe..b0eae2a3c895 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule)
return 0;
}
-static int compare_root(struct vfsmount *mnt, void *arg)
-{
- return inode_to_key(d_backing_inode(mnt->mnt_root)) ==
- (unsigned long)arg;
-}
-
void audit_trim_trees(void)
{
struct list_head cursor;
@@ -683,8 +677,9 @@ void audit_trim_trees(void)
while (cursor.next != &tree_list) {
struct audit_tree *tree;
struct path path;
- struct vfsmount *root_mnt;
struct audit_node *node;
+ struct path *paths;
+ struct path array[16];
int err;
tree = container_of(cursor.next, struct audit_tree, list);
@@ -696,9 +691,9 @@ void audit_trim_trees(void)
if (err)
goto skip_it;
- root_mnt = collect_mounts(&path);
+ paths = collect_paths(&path, array, 16);
path_put(&path);
- if (IS_ERR(root_mnt))
+ if (IS_ERR(paths))
goto skip_it;
spin_lock(&hash_lock);
@@ -706,14 +701,17 @@ void audit_trim_trees(void)
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
node->index |= 1U<<31;
- if (iterate_mounts(compare_root,
- (void *)(chunk->key),
- root_mnt))
- node->index &= ~(1U<<31);
+ for (struct path *p = paths; p->dentry; p++) {
+ struct inode *inode = p->dentry->d_inode;
+ if (inode_to_key(inode) == chunk->key) {
+ node->index &= ~(1U<<31);
+ break;
+ }
+ }
}
spin_unlock(&hash_lock);
trim_marked(tree);
- drop_collected_mounts(root_mnt);
+ drop_collected_paths(paths, array);
skip_it:
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree)
put_tree(tree);
}
-static int tag_mount(struct vfsmount *mnt, void *arg)
+static int tag_mounts(struct path *paths, struct audit_tree *tree)
{
- return tag_chunk(d_backing_inode(mnt->mnt_root), arg);
+ for (struct path *p = paths; p->dentry; p++) {
+ int err = tag_chunk(p->dentry->d_inode, tree);
+ if (err)
+ return err;
+ }
+ return 0;
}
/*
@@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
struct path path;
- struct vfsmount *mnt;
+ struct path array[16];
+ struct path *paths;
int err;
rule->tree = NULL;
@@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule)
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
- mnt = collect_mounts(&path);
+ paths = collect_paths(&path, array, 16);
path_put(&path);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
+ if (IS_ERR(paths)) {
+ err = PTR_ERR(paths);
goto Err;
}
get_tree(tree);
- err = iterate_mounts(tag_mount, tree, mnt);
- drop_collected_mounts(mnt);
+ err = tag_mounts(paths, tree);
+ drop_collected_paths(paths, array);
if (!err) {
struct audit_node *node;
@@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new)
struct list_head cursor, barrier;
int failed = 0;
struct path path1, path2;
- struct vfsmount *tagged;
+ struct path array[16];
+ struct path *paths;
int err;
err = kern_path(new, 0, &path2);
if (err)
return err;
- tagged = collect_mounts(&path2);
+ paths = collect_paths(&path2, array, 16);
path_put(&path2);
- if (IS_ERR(tagged))
- return PTR_ERR(tagged);
+ if (IS_ERR(paths))
+ return PTR_ERR(paths);
err = kern_path(old, 0, &path1);
if (err) {
- drop_collected_mounts(tagged);
+ drop_collected_paths(paths, array);
return err;
}
@@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new)
continue;
}
- failed = iterate_mounts(tag_mount, tree, tagged);
+ failed = tag_mounts(paths, tree);
if (failed) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new)
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
path_put(&path1);
- drop_collected_mounts(tagged);
+ drop_collected_paths(paths, array);
return failed;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 78fd876a5473..eb98cd6fe91f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2864,7 +2864,7 @@ void __audit_openat2_how(struct open_how *how)
context->type = AUDIT_OPENAT2;
}
-void __audit_log_kern_module(char *name)
+void __audit_log_kern_module(const char *name)
{
struct audit_context *context = audit_context();
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 70502f038b92..269c04a24664 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o
ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
endif
@@ -53,6 +53,9 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
+ifeq ($(CONFIG_DMA_SHARED_BUFFER),y)
+obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o
+endif
CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 0d56cea71602..5b37753799d2 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -550,6 +550,34 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
}
}
+/*
+ * Reserve an arena virtual address range without populating it. This call stops
+ * bpf_arena_alloc_pages from adding pages to this range.
+ */
+static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt)
+{
+ long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
+ long pgoff;
+ int ret;
+
+ if (uaddr & ~PAGE_MASK)
+ return 0;
+
+ pgoff = compute_pgoff(arena, uaddr);
+ if (pgoff + page_cnt > page_cnt_max)
+ return -EINVAL;
+
+ guard(mutex)(&arena->lock);
+
+ /* Cannot guard already allocated pages. */
+ ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
+ if (ret)
+ return -EBUSY;
+
+ /* "Allocate" the region to prevent it from being allocated. */
+ return range_tree_clear(&arena->rt, pgoff, page_cnt);
+}
+
__bpf_kfunc_start_defs();
__bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt,
@@ -573,11 +601,26 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt
return;
arena_free_pages(arena, (long)ptr__ign, page_cnt);
}
+
+__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt)
+{
+ struct bpf_map *map = p__map;
+ struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+ if (map->map_type != BPF_MAP_TYPE_ARENA)
+ return -EINVAL;
+
+ if (!page_cnt)
+ return 0;
+
+ return arena_reserve_pages(arena, (long)ptr__ign, page_cnt);
+}
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(arena_kfuncs)
BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)
static const struct btf_kfunc_id_set common_kfunc_set = {
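(Illustrative aside, not part of the commit: the bpf_arena_reserve_pages() kfunc added above is callable from BPF programs like the other arena kfuncs. The sketch below assumes a libbpf-style arena map and a page-aligned reserve_base address supplied by user space — obtaining and casting arena addresses is glossed over; only the kfunc signature and its 0/-EBUSY/-EINVAL contract come from the patch.)

/* Sketch: reserve one arena page so bpf_arena_alloc_pages() can no
 * longer hand that range out.
 */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, __u32 page_cnt) __ksym __weak;

struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 16);	/* arena size in pages */
} arena SEC(".maps");

void *reserve_base;	/* assumed: page-aligned address inside the arena */

SEC("syscall")
int reserve_guard_range(void *ctx)
{
	/* 0 on success, -EBUSY if pages in the range are already allocated */
	return bpf_arena_reserve_pages(&arena, reserve_base, 1);
}

char _license[] SEC("license") = "GPL";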
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index eb28c0f219ee..3d080916faf9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -530,8 +530,6 @@ static int array_map_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
- u32 int_data;
-
/* One exception for keyless BTF: .bss/.data/.rodata map */
if (btf_type_is_void(key_type)) {
if (map->map_type != BPF_MAP_TYPE_ARRAY ||
@@ -544,14 +542,11 @@ static int array_map_check_btf(const struct bpf_map *map,
return 0;
}
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- /* bpf array can only take a u32 key. This check makes sure
+ /*
+ * Bpf array can only take a u32 key. This check makes sure
* that the btf matches the attr used during map_create.
*/
- if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+ if (!btf_type_is_i32(key_type))
return -EINVAL;
return 0;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 380e9a7cac75..0cbcae727079 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -38,8 +38,7 @@ static DEFINE_MUTEX(link_mutex);
/* incremented on every opened seq_file */
static atomic64_t session_id;
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
- const struct bpf_iter_seq_info *seq_info);
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link);
static void bpf_iter_inc_seq_num(struct seq_file *seq)
{
@@ -257,7 +256,7 @@ static int iter_open(struct inode *inode, struct file *file)
{
struct bpf_iter_link *link = inode->i_private;
- return prepare_seq_file(file, link, __get_seq_info(link));
+ return prepare_seq_file(file, link);
}
static int iter_release(struct inode *inode, struct file *file)
@@ -553,7 +552,8 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
if (!link)
return -ENOMEM;
- bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog,
+ attr->link_create.attach_type);
link->tinfo = tinfo;
err = bpf_link_prime(&link->link, &link_primer);
@@ -586,9 +586,9 @@ static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
priv_data->done_stop = false;
}
-static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
- const struct bpf_iter_seq_info *seq_info)
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link)
{
+ const struct bpf_iter_seq_info *seq_info = __get_seq_info(link);
struct bpf_iter_priv_data *priv_data;
struct bpf_iter_target_info *tinfo;
struct bpf_prog *prog;
@@ -653,7 +653,7 @@ int bpf_iter_new_fd(struct bpf_link *link)
}
iter_link = container_of(link, struct bpf_iter_link, link);
- err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
+ err = prepare_seq_file(file, iter_link);
if (err)
goto free_file;
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index fa56c30833ff..b931fbceb54d 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -722,13 +722,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
{
- u32 int_data;
-
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+ if (!btf_type_is_i32(key_type))
return -EINVAL;
return 0;
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 3dabdd137d10..2d6e1c98d8ad 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
list) {
__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
- if (++nfree == LOCAL_FREE_TARGET)
+ if (++nfree == lru->target_free)
break;
}
- if (nfree < LOCAL_FREE_TARGET)
- __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
+ if (nfree < lru->target_free)
+ __bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
@@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
}
+
+ lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2,
+ 1, LOCAL_FREE_TARGET);
}
static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index cbd8d3720c2b..fe2661a58ea9 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -58,6 +58,7 @@ struct bpf_lru {
del_from_htab_func del_from_htab;
void *del_arg;
unsigned int hash_offset;
+ unsigned int target_free;
unsigned int nr_scans;
bool percpu;
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index db13ee70d94d..687a3e9c76f5 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -601,7 +601,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
if (model->ret_size > 0)
flags |= BPF_TRAMP_F_RET_FENTRY_RET;
- size = arch_bpf_trampoline_size(model, flags, tlinks, NULL);
+ size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func);
if (size <= 0)
return size ? : -EFAULT;
@@ -808,7 +808,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto reset_unlock;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
- &bpf_struct_ops_link_lops, prog);
+ &bpf_struct_ops_link_lops, prog, prog->expected_attach_type);
*plink++ = &link->link;
ksym = kzalloc(sizeof(*ksym), GFP_USER);
@@ -1351,7 +1351,8 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
err = -ENOMEM;
goto err_out;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
+ attr->link_create.attach_type);
err = bpf_link_prime(&link->link, &link_primer);
if (err)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 16ba36f34dfa..64739308902f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -26,6 +26,7 @@
#include <linux/bsearch.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <linux/overflow.h>
#include <net/netfilter/nf_bpf_link.h>
@@ -857,26 +858,43 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
EXPORT_SYMBOL_GPL(btf_type_by_id);
/*
- * Regular int is not a bit field and it must be either
- * u8/u16/u32/u64 or __int128.
+ * Check that the type @t is a regular int. This means that @t is not
+ * a bit field and it has the same size as either of u8/u16/u32/u64
+ * or __int128. If @expected_size is not zero, then size of @t should
+ * be the same. A caller should already have checked that the type @t
+ * is an integer.
*/
+static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size)
+{
+ u32 int_data = btf_type_int(t);
+ u8 nr_bits = BTF_INT_BITS(int_data);
+ u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+
+ return BITS_PER_BYTE_MASKED(nr_bits) == 0 &&
+ BTF_INT_OFFSET(int_data) == 0 &&
+ (nr_bytes <= 16 && is_power_of_2(nr_bytes)) &&
+ (expected_size == 0 || nr_bytes == expected_size);
+}
+
static bool btf_type_int_is_regular(const struct btf_type *t)
{
- u8 nr_bits, nr_bytes;
- u32 int_data;
+ return __btf_type_int_is_regular(t, 0);
+}
- int_data = btf_type_int(t);
- nr_bits = BTF_INT_BITS(int_data);
- nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
- if (BITS_PER_BYTE_MASKED(nr_bits) ||
- BTF_INT_OFFSET(int_data) ||
- (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
- nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) &&
- nr_bytes != (2 * sizeof(u64)))) {
- return false;
- }
+bool btf_type_is_i32(const struct btf_type *t)
+{
+ return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4);
+}
- return true;
+bool btf_type_is_i64(const struct btf_type *t)
+{
+ return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8);
+}
+
+bool btf_type_is_primitive(const struct btf_type *t)
+{
+ return (btf_type_is_int(t) && btf_type_int_is_regular(t)) ||
+ btf_is_any_enum(t);
}
/*
@@ -3442,7 +3460,8 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
node_field_name = strstr(value_type, ":");
if (!node_field_name)
return -EINVAL;
- value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN);
+ value_type = kstrndup(value_type, node_field_name - value_type,
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!value_type)
return -ENOMEM;
id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
@@ -3957,7 +3976,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* This needs to be kzalloc to zero out padding and unused fields, see
* comment in btf_record_equal.
*/
- rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN);
+ rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!rec)
return ERR_PTR(-ENOMEM);
@@ -5583,7 +5602,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
if (id < 0)
continue;
- new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
GFP_KERNEL | __GFP_NOWARN);
if (!new_aof) {
ret = -ENOMEM;
@@ -5610,7 +5629,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
if (ret != BTF_FIELD_FOUND)
continue;
- new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
+ new_aof = krealloc(aof, struct_size(new_aof, ids, aof->cnt + 1),
GFP_KERNEL | __GFP_NOWARN);
if (!new_aof) {
ret = -ENOMEM;
@@ -5647,7 +5666,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
continue;
parse:
tab_cnt = tab ? tab->cnt : 0;
- new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]),
+ new_tab = krealloc(tab, struct_size(new_tab, types, tab_cnt + 1),
GFP_KERNEL | __GFP_NOWARN);
if (!new_tab) {
ret = -ENOMEM;
@@ -6181,8 +6200,7 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
return kctx_type_id;
}
-BTF_ID_LIST(bpf_ctx_convert_btf_id)
-BTF_ID(struct, bpf_ctx_convert)
+BTF_ID_LIST_SINGLE(bpf_ctx_convert_btf_id, struct, bpf_ctx_convert)
static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name,
void *data, unsigned int data_size)
@@ -6383,16 +6401,15 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t)
{
/* skip modifiers */
t = btf_type_skip_modifiers(btf, t->type, NULL);
-
- return btf_type_is_int(t);
+ return btf_type_is_void(t) || btf_type_is_int(t);
}
-static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
- int off)
+u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+ int off)
{
const struct btf_param *args;
const struct btf_type *t;
@@ -6541,6 +6558,7 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
{ "xprt_put_cong", 0x10 },
/* tcp */
{ "tcp_send_reset", 0x11 },
+ { "tcp_sendmsg_locked", 0x100 },
/* tegra_apb_dma */
{ "tegra_dma_tx_status", 0x100 },
/* timer_migration */
@@ -6671,7 +6689,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
tname, off);
return false;
}
- arg = get_ctx_arg_idx(btf, t, off);
+ arg = btf_ctx_arg_idx(btf, t, off);
args = (const struct btf_param *)(t + 1);
/* if (t == NULL) Fall back to default BPF prog with
* MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -6776,14 +6794,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
}
- if (t->type == 0)
- /* This is a pointer to void.
- * It is the same as scalar from the verifier safety pov.
- * No further pointer walking is allowed.
- */
- return true;
-
- if (is_int_ptr(btf, t))
+ /*
+ * If it's a pointer to void, it's the same as scalar from the verifier
+ * safety POV. Either way, no further pointer walking is allowed.
+ */
+ if (is_void_or_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -6829,10 +6844,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* Is this a func with potential NULL args? */
if (strcmp(tname, raw_tp_null_args[i].func))
continue;
- if (raw_tp_null_args[i].mask & (0x1 << (arg * 4)))
+ if (raw_tp_null_args[i].mask & (0x1ULL << (arg * 4)))
info->reg_type |= PTR_MAYBE_NULL;
/* Is the current arg IS_ERR? */
- if (raw_tp_null_args[i].mask & (0x2 << (arg * 4)))
+ if (raw_tp_null_args[i].mask & (0x2ULL << (arg * 4)))
ptr_err_raw_tp = true;
break;
}
@@ -6905,6 +6920,7 @@ enum bpf_struct_walk_result {
/* < 0 error */
WALK_SCALAR = 0,
WALK_PTR,
+ WALK_PTR_UNTRUSTED,
WALK_STRUCT,
};
@@ -7146,6 +7162,8 @@ error:
*field_name = mname;
return WALK_PTR;
}
+
+ return WALK_PTR_UNTRUSTED;
}
/* Allow more flexible access within an int as long as
@@ -7218,6 +7236,9 @@ int btf_struct_access(struct bpf_verifier_log *log,
*next_btf_id = id;
*flag = tmp_flag;
return PTR_TO_BTF_ID;
+ case WALK_PTR_UNTRUSTED:
+ *flag = MEM_RDONLY | PTR_UNTRUSTED;
+ return PTR_TO_MEM;
case WALK_SCALAR:
return SCALAR_VALUE;
case WALK_STRUCT:
@@ -7630,11 +7651,12 @@ cand_cache_unlock:
}
enum btf_arg_tag {
- ARG_TAG_CTX = BIT_ULL(0),
- ARG_TAG_NONNULL = BIT_ULL(1),
- ARG_TAG_TRUSTED = BIT_ULL(2),
- ARG_TAG_NULLABLE = BIT_ULL(3),
- ARG_TAG_ARENA = BIT_ULL(4),
+ ARG_TAG_CTX = BIT_ULL(0),
+ ARG_TAG_NONNULL = BIT_ULL(1),
+ ARG_TAG_TRUSTED = BIT_ULL(2),
+ ARG_TAG_UNTRUSTED = BIT_ULL(3),
+ ARG_TAG_NULLABLE = BIT_ULL(4),
+ ARG_TAG_ARENA = BIT_ULL(5),
};
/* Process BTF of a function to produce high-level expectation of function
@@ -7662,7 +7684,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
return 0;
if (!prog->aux->func_info) {
- bpf_log(log, "Verifier bug\n");
+ verifier_bug(env, "func_info undefined");
return -EFAULT;
}
@@ -7686,7 +7708,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
tname = btf_name_by_offset(btf, fn_t->name_off);
if (prog->aux->func_info_aux[subprog].unreliable) {
- bpf_log(log, "Verifier bug in function %s()\n", tname);
+ verifier_bug(env, "unreliable BTF for function %s()", tname);
return -EFAULT;
}
if (prog_type == BPF_PROG_TYPE_EXT)
@@ -7742,6 +7764,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
tags |= ARG_TAG_CTX;
} else if (strcmp(tag, "trusted") == 0) {
tags |= ARG_TAG_TRUSTED;
+ } else if (strcmp(tag, "untrusted") == 0) {
+ tags |= ARG_TAG_UNTRUSTED;
} else if (strcmp(tag, "nonnull") == 0) {
tags |= ARG_TAG_NONNULL;
} else if (strcmp(tag, "nullable") == 0) {
@@ -7802,6 +7826,38 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
sub->args[i].btf_id = kern_type_id;
continue;
}
+ if (tags & ARG_TAG_UNTRUSTED) {
+ struct btf *vmlinux_btf;
+ int kern_type_id;
+
+ if (tags & ~ARG_TAG_UNTRUSTED) {
+ bpf_log(log, "arg#%d untrusted cannot be combined with any other tags\n", i);
+ return -EINVAL;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (btf_type_is_void(ref_t) || btf_type_is_primitive(ref_t)) {
+ sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED;
+ sub->args[i].mem_size = 0;
+ continue;
+ }
+
+ kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t);
+ if (kern_type_id < 0)
+ return kern_type_id;
+
+ vmlinux_btf = bpf_get_btf_vmlinux();
+ ref_t = btf_type_by_id(vmlinux_btf, kern_type_id);
+ if (!btf_type_is_struct(ref_t)) {
+ tname = __btf_name_by_offset(vmlinux_btf, t->name_off);
+ bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n",
+ i, btf_type_str(ref_t), tname);
+ return -EINVAL;
+ }
+ sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ sub->args[i].btf_id = kern_type_id;
+ continue;
+ }
if (tags & ARG_TAG_ARENA) {
if (tags & ~ARG_TAG_ARENA) {
bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i);
@@ -8185,7 +8241,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
attr->attr.mode = 0444;
attr->size = btf->data_size;
attr->private = btf->data;
- attr->read_new = sysfs_bin_attr_simple_read;
+ attr->read = sysfs_bin_attr_simple_read;
err = sysfs_create_bin_file(btf_kobj, attr);
if (err) {
@@ -8563,7 +8619,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
/* Grow set */
set = krealloc(tab->sets[hook],
- offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]),
+ struct_size(set, pairs, set_cnt + add_set->cnt),
GFP_KERNEL | __GFP_NOWARN);
if (!set) {
ret = -ENOMEM;
@@ -8849,7 +8905,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
}
tab = krealloc(btf->dtor_kfunc_tab,
- offsetof(struct btf_id_dtor_kfunc_tab, dtors[tab_cnt + add_cnt]),
+ struct_size(tab, dtors, tab_cnt + add_cnt),
GFP_KERNEL | __GFP_NOWARN);
if (!tab) {
ret = -ENOMEM;
@@ -9021,7 +9077,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
bpf_free_cands_from_cache(*cc);
*cc = NULL;
}
- new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
+ new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL_ACCOUNT);
if (!new_cands) {
bpf_free_cands(cands);
return ERR_PTR(-ENOMEM);
@@ -9029,7 +9085,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
/* strdup the name, since it will stay in cache.
* the cands->name points to strings in prog's BTF and the prog can be unloaded.
*/
- new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
+ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL_ACCOUNT);
bpf_free_cands(cands);
if (!new_cands->name) {
kfree(new_cands);
@@ -9113,7 +9169,7 @@ bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
continue;
/* most of the time there is only one candidate for a given kind+name pair */
- new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
+ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL_ACCOUNT);
if (!new_cands) {
bpf_free_cands(cands);
return ERR_PTR(-ENOMEM);
@@ -9230,7 +9286,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
* into arrays of btf_ids of struct fields and array indices.
*/
- specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
+ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL_ACCOUNT);
if (!specs)
return -ENOMEM;
@@ -9255,7 +9311,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
goto out;
}
if (cc->cnt) {
- cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+ cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL_ACCOUNT);
if (!cands.cands) {
err = -ENOMEM;
goto out;
@@ -9407,8 +9463,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
tab = btf->struct_ops_tab;
if (!tab) {
- tab = kzalloc(offsetof(struct btf_struct_ops_tab, ops[4]),
- GFP_KERNEL);
+ tab = kzalloc(struct_size(tab, ops, 4), GFP_KERNEL);
if (!tab)
return -ENOMEM;
tab->capacity = 4;
@@ -9421,8 +9476,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
if (tab->cnt == tab->capacity) {
new_tab = krealloc(tab,
- offsetof(struct btf_struct_ops_tab,
- ops[tab->capacity * 2]),
+ struct_size(tab, ops, tab->capacity * 2),
GFP_KERNEL);
if (!new_tab)
return -ENOMEM;
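(Illustrative aside, not part of the commit: the new "untrusted" argument tag handled in btf_prepare_func_args() above lets a global subprogram take an arbitrary kernel pointer whose loads the verifier downgrades to probe-read semantics — PTR_TO_BTF_ID | PTR_UNTRUSTED for struct pointees, read-only untrusted memory for void/primitive pointees. The sketch below is hedged: the __arg_untrusted macro spelling is an assumption, since the kernel only checks for a BTF decl tag named "arg:untrusted", and the tracepoint and body are made up for illustration.)

/* Sketch: annotate a global subprog argument as untrusted. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define __arg_untrusted __attribute__((btf_decl_tag("arg:untrusted")))

__noinline int pid_of(struct task_struct *t __arg_untrusted)
{
	return t ? t->pid : -1;		/* load is converted to a probe read */
}

SEC("tp_btf/sched_switch")
int BPF_PROG(on_switch, bool preempt, struct task_struct *prev, struct task_struct *next)
{
	bpf_printk("next pid %d", pid_of(next));
	return 0;
}

char _license[] SEC("license") = "GPL";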
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 84f58f3d028a..180b630279b9 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -41,6 +41,19 @@ static int __init cgroup_bpf_wq_init(void)
}
core_initcall(cgroup_bpf_wq_init);
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data);
+
+static struct notifier_block cgroup_bpf_lifetime_nb = {
+ .notifier_call = cgroup_bpf_lifetime_notify,
+};
+
+void __init cgroup_bpf_lifetime_notifier_init(void)
+{
+ BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+ &cgroup_bpf_lifetime_nb));
+}
+
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@@ -206,7 +219,7 @@ bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
}
#endif /* CONFIG_BPF_LSM */
-void cgroup_bpf_offline(struct cgroup *cgrp)
+static void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
percpu_ref_kill(&cgrp->bpf.refcnt);
@@ -491,7 +504,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
* cgroup_bpf_inherit() - inherit effective programs from parent
* @cgrp: the cgroup to modify
*/
-int cgroup_bpf_inherit(struct cgroup *cgrp)
+static int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use marco instead of const int, since compiler thinks
* that array below is variable length
@@ -534,6 +547,27 @@ cleanup:
return -ENOMEM;
}
+static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct cgroup *cgrp = data;
+ int ret = 0;
+
+ if (cgrp->root != &cgrp_dfl_root)
+ return NOTIFY_OK;
+
+ switch (action) {
+ case CGROUP_LIFETIME_ONLINE:
+ ret = cgroup_bpf_inherit(cgrp);
+ break;
+ case CGROUP_LIFETIME_OFFLINE:
+ cgroup_bpf_offline(cgrp);
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
static int update_effective_progs(struct cgroup *cgrp,
enum cgroup_bpf_attach_type atype)
{
@@ -624,6 +658,116 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
return NULL;
}
+static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
+{
+ struct bpf_link *link = ERR_PTR(-EINVAL);
+
+ if (flags & BPF_F_ID)
+ link = bpf_link_by_id(id_or_fd);
+ else if (id_or_fd)
+ link = bpf_link_get_from_fd(id_or_fd);
+ return link;
+}
+
+static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd)
+{
+ struct bpf_prog *prog = ERR_PTR(-EINVAL);
+
+ if (flags & BPF_F_ID)
+ prog = bpf_prog_by_id(id_or_fd);
+ else if (id_or_fd)
+ prog = bpf_prog_get(id_or_fd);
+ return prog;
+}
+
+static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd)
+{
+ bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID;
+ struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL);
+ bool preorder = flags & BPF_F_PREORDER;
+ struct bpf_link *anchor_link = NULL;
+ struct bpf_prog *anchor_prog = NULL;
+ bool is_before, is_after;
+
+ is_before = flags & BPF_F_BEFORE;
+ is_after = flags & BPF_F_AFTER;
+ if (is_link || is_id || id_or_fd) {
+ /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */
+ if (is_before == is_after)
+ return ERR_PTR(-EINVAL);
+ if ((is_link && !link) || (!is_link && !prog))
+ return ERR_PTR(-EINVAL);
+ } else if (!hlist_empty(progs)) {
+ /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */
+ if (is_before && is_after)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (is_link) {
+ anchor_link = bpf_get_anchor_link(flags, id_or_fd);
+ if (IS_ERR(anchor_link))
+ return ERR_CAST(anchor_link);
+ } else if (is_id || id_or_fd) {
+ anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
+ if (IS_ERR(anchor_prog))
+ return ERR_CAST(anchor_prog);
+ }
+
+ if (!anchor_prog && !anchor_link) {
+ /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER
+ * doesn't matter since either prepend or append to a combined
+ * list of progs will end up with correct result.
+ */
+ hlist_for_each_entry(pltmp, progs, node) {
+ if (is_before)
+ return pltmp;
+ if (pltmp->node.next)
+ continue;
+ return pltmp;
+ }
+ return NULL;
+ }
+
+ hlist_for_each_entry(pltmp, progs, node) {
+ if ((anchor_prog && anchor_prog == pltmp->prog) ||
+ (anchor_link && anchor_link == &pltmp->link->link)) {
+ if (!!(pltmp->flags & BPF_F_PREORDER) != preorder)
+ goto out;
+ pl = pltmp;
+ goto out;
+ }
+ }
+
+ pl = ERR_PTR(-ENOENT);
+out:
+ if (anchor_link)
+ bpf_link_put(anchor_link);
+ else
+ bpf_prog_put(anchor_prog);
+ return pl;
+}
+
+static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs,
+ struct bpf_prog *prog, struct bpf_cgroup_link *link,
+ u32 flags, u32 id_or_fd)
+{
+ struct bpf_prog_list *pltmp;
+
+ pltmp = get_prog_list(progs, prog, link, flags, id_or_fd);
+ if (IS_ERR(pltmp))
+ return PTR_ERR(pltmp);
+
+ if (!pltmp)
+ hlist_add_head(&pl->node, progs);
+ else if (flags & BPF_F_BEFORE)
+ hlist_add_before(&pl->node, &pltmp->node);
+ else
+ hlist_add_behind(&pl->node, &pltmp->node);
+
+ return 0;
+}
+
/**
* __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
* propagate the change to descendants
@@ -633,6 +777,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
* @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
* @type: Type of attach operation
* @flags: Option flags
+ * @id_or_fd: Relative prog id or fd
+ * @revision: bpf_prog_list revision
*
* Exactly one of @prog or @link can be non-null.
* Must be called with cgroup_mutex held.
@@ -640,7 +786,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
static int __cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
- enum bpf_attach_type type, u32 flags)
+ enum bpf_attach_type type, u32 flags, u32 id_or_fd,
+ u64 revision)
{
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
struct bpf_prog *old_prog = NULL;
@@ -656,6 +803,9 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
/* invalid combination */
return -EINVAL;
+ if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER)))
+ /* only either replace or insertion with before/after */
+ return -EINVAL;
if (link && (prog || replace_prog))
/* only either link or prog/replace_prog can be specified */
return -EINVAL;
@@ -666,6 +816,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
if (atype < 0)
return -EINVAL;
+ if (revision && revision != cgrp->bpf.revisions[atype])
+ return -ESTALE;
progs = &cgrp->bpf.progs[atype];
@@ -694,22 +846,18 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
if (pl) {
old_prog = pl->prog;
} else {
- struct hlist_node *last = NULL;
-
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
}
- if (hlist_empty(progs))
- hlist_add_head(&pl->node, progs);
- else
- hlist_for_each(last, progs) {
- if (last->next)
- continue;
- hlist_add_behind(&pl->node, last);
- break;
- }
+
+ err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd);
+ if (err) {
+ kfree(pl);
+ bpf_cgroup_storages_free(new_storage);
+ return err;
+ }
}
pl->prog = prog;
@@ -719,7 +867,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
cgrp->bpf.flags[atype] = saved_flags;
if (type == BPF_LSM_CGROUP) {
- err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
+ err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type);
if (err)
goto cleanup;
}
@@ -728,6 +876,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
if (err)
goto cleanup_trampoline;
+ cgrp->bpf.revisions[atype] += 1;
if (old_prog) {
if (type == BPF_LSM_CGROUP)
bpf_trampoline_unlink_cgroup_shim(old_prog);
@@ -759,12 +908,13 @@ static int cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
enum bpf_attach_type type,
- u32 flags)
+ u32 flags, u32 id_or_fd, u64 revision)
{
int ret;
cgroup_lock();
- ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
+ id_or_fd, revision);
cgroup_unlock();
return ret;
}
@@ -834,7 +984,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
struct hlist_head *progs;
bool found = false;
- atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
+ atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id);
if (atype < 0)
return -EINVAL;
@@ -852,6 +1002,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
if (!found)
return -ENOENT;
+ cgrp->bpf.revisions[atype] += 1;
old_prog = xchg(&link->link.prog, new_prog);
replace_effective_prog(cgrp, atype, link);
bpf_prog_put(old_prog);
@@ -977,12 +1128,14 @@ found:
* @prog: A program to detach or NULL
* @link: A link to detach or NULL
* @type: Type of detach operation
+ * @revision: bpf_prog_list revision
*
* At most one of @prog or @link can be non-NULL.
* Must be called with cgroup_mutex held.
*/
static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- struct bpf_cgroup_link *link, enum bpf_attach_type type)
+ struct bpf_cgroup_link *link, enum bpf_attach_type type,
+ u64 revision)
{
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
@@ -1000,6 +1153,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
if (atype < 0)
return -EINVAL;
+ if (revision && revision != cgrp->bpf.revisions[atype])
+ return -ESTALE;
+
progs = &cgrp->bpf.progs[atype];
flags = cgrp->bpf.flags[atype];
@@ -1025,6 +1181,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
/* now can actually delete it from this cgroup list */
hlist_del(&pl->node);
+ cgrp->bpf.revisions[atype] += 1;
kfree(pl);
if (hlist_empty(progs))
@@ -1040,12 +1197,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
}
static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type)
+ enum bpf_attach_type type, u64 revision)
{
int ret;
cgroup_lock();
- ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
cgroup_unlock();
return ret;
}
@@ -1063,6 +1220,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
struct bpf_prog_array *effective;
int cnt, ret = 0, i;
int total_cnt = 0;
+ u64 revision = 0;
u32 flags;
if (effective_query && prog_attach_flags)
@@ -1100,6 +1258,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return -EFAULT;
if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
return -EFAULT;
+ if (!effective_query && from_atype == to_atype)
+ revision = cgrp->bpf.revisions[from_atype];
+ if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+ return -EFAULT;
if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
/* return early if user requested only program count + flags */
return 0;
@@ -1182,7 +1344,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
}
ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
- attr->attach_type, attr->attach_flags);
+ attr->attach_type, attr->attach_flags,
+ attr->relative_fd, attr->expected_revision);
if (replace_prog)
bpf_prog_put(replace_prog);
@@ -1204,7 +1367,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
if (IS_ERR(prog))
prog = NULL;
- ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision);
if (prog)
bpf_prog_put(prog);
@@ -1233,8 +1396,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
}
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
- cg_link->type));
- if (cg_link->type == BPF_LSM_CGROUP)
+ link->attach_type, 0));
+ if (link->attach_type == BPF_LSM_CGROUP)
bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
cg = cg_link->cgroup;
@@ -1276,7 +1439,7 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
"cgroup_id:\t%llu\n"
"attach_type:\t%d\n",
cg_id,
- cg_link->type);
+ link->attach_type);
}
static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
@@ -1292,7 +1455,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
cgroup_unlock();
info->cgroup.cgroup_id = cg_id;
- info->cgroup.attach_type = cg_link->type;
+ info->cgroup.attach_type = link->attach_type;
return 0;
}
@@ -1305,6 +1468,13 @@ static const struct bpf_link_ops bpf_cgroup_link_lops = {
.fill_link_info = bpf_cgroup_link_fill_link_info,
};
+#define BPF_F_LINK_ATTACH_MASK \
+ (BPF_F_ID | \
+ BPF_F_BEFORE | \
+ BPF_F_AFTER | \
+ BPF_F_PREORDER | \
+ BPF_F_LINK)
+
int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
struct bpf_link_primer link_primer;
@@ -1312,7 +1482,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
struct cgroup *cgrp;
int err;
- if (attr->link_create.flags)
+ if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK))
return -EINVAL;
cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
@@ -1325,9 +1495,8 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
goto out_put_cgroup;
}
bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
- prog);
+ prog, attr->link_create.attach_type);
link->cgroup = cgrp;
- link->type = attr->link_create.attach_type;
err = bpf_link_prime(&link->link, &link_primer);
if (err) {
@@ -1336,7 +1505,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
}
err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
- link->type, BPF_F_ALLOW_MULTI);
+ link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
+ attr->link_create.cgroup.relative_fd,
+ attr->link_create.cgroup.expected_revision);
if (err) {
bpf_link_cleanup(&link_primer);
goto out_put_cgroup;
@@ -1653,10 +1824,6 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
@@ -2104,7 +2271,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -2204,10 +2371,6 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_sysctl_get_name:
return &bpf_sysctl_get_name_proto;
@@ -2351,10 +2514,6 @@ cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_get_netns_cookie:
@@ -2418,22 +2577,22 @@ static bool cg_sockopt_is_valid_access(int off, int size,
}
switch (off) {
- case offsetof(struct bpf_sockopt, sk):
+ case bpf_ctx_range_ptr(struct bpf_sockopt, sk):
if (size != sizeof(__u64))
return false;
info->reg_type = PTR_TO_SOCKET;
break;
- case offsetof(struct bpf_sockopt, optval):
+ case bpf_ctx_range_ptr(struct bpf_sockopt, optval):
if (size != sizeof(__u64))
return false;
info->reg_type = PTR_TO_PACKET;
break;
- case offsetof(struct bpf_sockopt, optval_end):
+ case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end):
if (size != sizeof(__u64))
return false;
info->reg_type = PTR_TO_PACKET_END;
break;
- case offsetof(struct bpf_sockopt, retval):
+ case bpf_ctx_range(struct bpf_sockopt, retval):
if (size != size_default)
return false;
return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
@@ -2601,23 +2760,3 @@ cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return NULL;
}
}
-
-/* Common helpers for cgroup hooks with valid process context. */
-const struct bpf_func_proto *
-cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- switch (func_id) {
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
-#ifdef CONFIG_CGROUP_NET_CLASSID
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_curr_proto;
-#endif
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
- default:
- return NULL;
- }
-}
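Not part of the patch: the helpers dropped from cgroup_current_func_proto() above are instead resolved through bpf_base_func_proto() (see the helpers.c hunks below), so cgroup programs keep working unchanged. A minimal BPF-side sketch using one of them from a cgroup/sysctl hook (program name and policy are illustrative):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("cgroup/sysctl")
int sysctl_root_only(struct bpf_sysctl *ctx)
{
        /* bpf_get_current_uid_gid() is now served by the common base proto;
         * the uid is in the lower 32 bits. Return 1 to allow, 0 to reject.
         */
        __u64 uid_gid = bpf_get_current_uid_gid();

        return (__u32)uid_gid == 0;
}

char LICENSE[] SEC("license") = "GPL";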
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ba6b6118cf50..09dde5b00d0c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -134,6 +134,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
mutex_init(&fp->aux->ext_mutex);
mutex_init(&fp->aux->dst_mutex);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_prog_stream_init(fp);
+#endif
+
return fp;
}
@@ -304,7 +308,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
if (!raw)
return -ENOMEM;
- sha1_init(digest);
+ sha1_init_raw(digest);
memset(ws, 0, sizeof(ws));
/* We need to take out the map fd for the digest calculation
@@ -778,7 +782,10 @@ bool is_bpf_text_address(unsigned long addr)
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
- struct bpf_ksym *ksym = bpf_ksym_find(addr);
+ struct bpf_ksym *ksym;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ ksym = bpf_ksym_find(addr);
return ksym && ksym->prog ?
container_of(ksym, struct bpf_prog_aux, ksym)->prog :
@@ -1290,6 +1297,13 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
return 0;
}
+const char *bpf_jit_get_prog_name(struct bpf_prog *prog)
+{
+ if (prog->aux->ksym.prog)
+ return prog->aux->ksym.name;
+ return prog->aux->name;
+}
+
static int bpf_jit_blind_insn(const struct bpf_insn *from,
const struct bpf_insn *aux,
struct bpf_insn *to_buff,
@@ -2102,14 +2116,15 @@ out:
#undef COND_JMP
/* ST, STX and LDX*/
ST_NOSPEC:
- /* Speculation barrier for mitigating Speculative Store Bypass.
- * In case of arm64, we rely on the firmware mitigation as
- * controlled via the ssbd kernel parameter. Whenever the
- * mitigation is enabled, it works for all of the kernel code
- * with no need to provide any additional instructions here.
- * In case of x86, we use 'lfence' insn for mitigation. We
- * reuse preexisting logic from Spectre v1 mitigation that
- * happens to produce the required code on x86 for v4 as well.
+ /* Speculation barrier for mitigating Speculative Store Bypass,
+ * Bounds-Check Bypass and Type Confusion. In case of arm64, we
+ * rely on the firmware mitigation as controlled via the ssbd
+ * kernel parameter. Whenever the mitigation is enabled, it
+ * works for all of the kernel code with no need to provide any
+ * additional instructions here. In case of x86, we use 'lfence'
+ * insn for mitigation. We reuse preexisting logic from Spectre
+ * v1 mitigation that happens to produce the required code on
+ * x86 for v4 as well.
*/
barrier_nospec();
CONT;
@@ -2358,8 +2373,8 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
return 0;
}
-bool bpf_prog_map_compatible(struct bpf_map *map,
- const struct bpf_prog *fp)
+static bool __bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
@@ -2368,14 +2383,6 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
if (fp->kprobe_override)
return false;
- /* XDP programs inserted into maps are not guaranteed to run on
- * a particular netdev (and can run outside driver context entirely
- * in the case of devmap and cpumap). Until device checks
- * are implemented, prohibit adding dev-bound programs to program maps.
- */
- if (bpf_prog_is_dev_bound(aux))
- return false;
-
spin_lock(&map->owner.lock);
if (!map->owner.type) {
/* There's no owner yet where we could check for
@@ -2409,6 +2416,19 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
return ret;
}
+bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp)
+{
+ /* XDP programs inserted into maps are not guaranteed to run on
+ * a particular netdev (and can run outside driver context entirely
+ * in the case of devmap and cpumap). Until device checks
+ * are implemented, prohibit adding dev-bound programs to program maps.
+ */
+ if (bpf_prog_is_dev_bound(fp->aux))
+ return false;
+
+ return __bpf_prog_map_compatible(map, fp);
+}
+
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@@ -2421,7 +2441,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
if (!map_type_contains_progs(map))
continue;
- if (!bpf_prog_map_compatible(map, fp)) {
+ if (!__bpf_prog_map_compatible(map, fp)) {
ret = -EINVAL;
goto out;
}
@@ -2469,7 +2489,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
- bool jit_needed = false;
+ bool jit_needed = fp->jit_requested;
if (fp->bpf_func)
goto finalize;
@@ -2856,6 +2876,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux = container_of(work, struct bpf_prog_aux, work);
#ifdef CONFIG_BPF_SYSCALL
bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+ bpf_prog_stream_free(aux->prog);
#endif
#ifdef CONFIG_CGROUP_BPF
if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
@@ -3029,6 +3050,21 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+/* By default, enable the verifier's mitigations against Spectre v1 and v4 for
+ * all archs. The value returned must not change at runtime as there is
+ * currently no support for reloading programs that were loaded without
+ * mitigations.
+ */
+bool __weak bpf_jit_bypass_spec_v1(void)
+{
+ return false;
+}
+
+bool __weak bpf_jit_bypass_spec_v4(void)
+{
+ return false;
+}
+
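Not part of the patch: a sketch of how an architecture whose JIT (or firmware) already provides the store-bypass mitigation could override the weak default above so the verifier can skip its own barriers; no arch in this diff does so, the body is purely illustrative.

/* In an arch's bpf_jit_comp.c: returning true tells the verifier this
 * architecture does not need the Spectre v4 mitigation to be emitted.
 */
bool bpf_jit_bypass_spec_v4(void)
{
        return true;
}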
/* Return true if the JIT inlines the call to the helper corresponding to
* the imm.
*
@@ -3139,6 +3175,22 @@ u64 __weak arch_bpf_timed_may_goto(void)
return 0;
}
+static noinline void bpf_prog_report_may_goto_violation(void)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return;
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n");
+ bpf_stream_dump_stack(ss);
+ }));
+#endif
+}
+
u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
{
u64 time = ktime_get_mono_fast_ns();
@@ -3149,8 +3201,10 @@ u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
return BPF_MAX_TIMED_LOOPS;
}
/* Check if we've exhausted our time slice, and zero count. */
- if (time - p->timestamp >= (NSEC_PER_SEC / 4))
+ if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) {
+ bpf_prog_report_may_goto_violation();
return 0;
+ }
/* Refresh the count for the stack frame. */
return BPF_MAX_TIMED_LOOPS;
}
@@ -3187,3 +3241,85 @@ EXPORT_SYMBOL(bpf_stats_enabled_key);
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
+
+#ifdef CONFIG_BPF_SYSCALL
+
+int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
+ const char **linep, int *nump)
+{
+ int idx = -1, insn_start, insn_end, len;
+ struct bpf_line_info *linfo;
+ void **jited_linfo;
+ struct btf *btf;
+ int nr_linfo;
+
+ btf = prog->aux->btf;
+ linfo = prog->aux->linfo;
+ jited_linfo = prog->aux->jited_linfo;
+
+ if (!btf || !linfo || !jited_linfo)
+ return -EINVAL;
+ len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len;
+
+ linfo = &prog->aux->linfo[prog->aux->linfo_idx];
+ jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx];
+
+ insn_start = linfo[0].insn_off;
+ insn_end = insn_start + len;
+ nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx;
+
+ for (int i = 0; i < nr_linfo &&
+ linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) {
+ if (jited_linfo[i] >= (void *)ip)
+ break;
+ idx = i;
+ }
+
+ if (idx == -1)
+ return -ENOENT;
+
+ /* Get base component of the file path. */
+ *filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
+ *filep = kbasename(*filep);
+ /* Obtain the source line, and strip whitespace in prefix. */
+ *linep = btf_name_by_offset(btf, linfo[idx].line_off);
+ while (isspace(**linep))
+ *linep += 1;
+ *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
+ return 0;
+}
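Not part of the patch: a minimal sketch of how a kernel-side caller could consume bpf_prog_get_file_line() to print a source location for a JITed instruction address (the report_ip name and pr_info output are illustrative only):

static void report_ip(struct bpf_prog *prog, unsigned long ip)
{
        const char *file, *line;
        int num;

        if (!bpf_prog_get_file_line(prog, ip, &file, &line, &num))
                pr_info("%s:%d: %s\n", file, num, line);
}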
+
+struct walk_stack_ctx {
+ struct bpf_prog *prog;
+};
+
+static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct walk_stack_ctx *ctxp = cookie;
+ struct bpf_prog *prog;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it has an
+ * active stack frame on the current stack trace, and won't disappear.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (!prog)
+ return true;
+ if (bpf_is_subprog(prog))
+ return true;
+ ctxp->prog = prog;
+ return false;
+}
+
+struct bpf_prog *bpf_prog_find_from_stack(void)
+{
+ struct walk_stack_ctx ctx = {};
+
+ arch_bpf_stack_walk(find_from_stack_cb, &ctx);
+ return ctx.prog;
+}
+
+#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 67e8a2fc1a99..b2b7b8ec2c2a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -282,8 +282,7 @@ static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty)
* This is equivalent to how NAPI decides whether to perform a full
* flush.
*/
- gro_flush(&rcpu->gro, !empty && HZ >= 1000);
- gro_normal_list(&rcpu->gro);
+ gro_flush_normal(&rcpu->gro, !empty && HZ >= 1000);
}
static int cpu_map_kthread_run(void *data)
diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c
new file mode 100644
index 000000000000..4dd7ef7c145c
--- /dev/null
+++ b/kernel/bpf/dmabuf_iter.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Google LLC */
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/dma-buf.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+
+static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return dma_buf_iter_begin();
+}
+
+static void *dmabuf_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct dma_buf *dmabuf = v;
+
+ ++*pos;
+
+ return dma_buf_iter_next(dmabuf);
+}
+
+struct bpf_iter__dmabuf {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct dma_buf *, dmabuf);
+};
+
+static int __dmabuf_seq_show(struct seq_file *seq, void *v, bool in_stop)
+{
+ struct bpf_iter_meta meta = {
+ .seq = seq,
+ };
+ struct bpf_iter__dmabuf ctx = {
+ .meta = &meta,
+ .dmabuf = v,
+ };
+ struct bpf_prog *prog = bpf_iter_get_info(&meta, in_stop);
+
+ if (prog)
+ return bpf_iter_run_prog(prog, &ctx);
+
+ return 0;
+}
+
+static int dmabuf_iter_seq_show(struct seq_file *seq, void *v)
+{
+ return __dmabuf_seq_show(seq, v, false);
+}
+
+static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v)
+{
+ struct dma_buf *dmabuf = v;
+
+ if (dmabuf)
+ dma_buf_put(dmabuf);
+}
+
+static const struct seq_operations dmabuf_iter_seq_ops = {
+ .start = dmabuf_iter_seq_start,
+ .next = dmabuf_iter_seq_next,
+ .stop = dmabuf_iter_seq_stop,
+ .show = dmabuf_iter_seq_show,
+};
+
+static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux,
+ struct seq_file *seq)
+{
+ seq_puts(seq, "dmabuf iter\n");
+}
+
+static const struct bpf_iter_seq_info dmabuf_iter_seq_info = {
+ .seq_ops = &dmabuf_iter_seq_ops,
+ .init_seq_private = NULL,
+ .fini_seq_private = NULL,
+ .seq_priv_size = 0,
+};
+
+static struct bpf_iter_reg bpf_dmabuf_reg_info = {
+ .target = "dmabuf",
+ .feature = BPF_ITER_RESCHED,
+ .show_fdinfo = bpf_iter_dmabuf_show_fdinfo,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__dmabuf, dmabuf),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &dmabuf_iter_seq_info,
+};
+
+DEFINE_BPF_ITER_FUNC(dmabuf, struct bpf_iter_meta *meta, struct dma_buf *dmabuf)
+BTF_ID_LIST_SINGLE(bpf_dmabuf_btf_id, struct, dma_buf)
+
+static int __init dmabuf_iter_init(void)
+{
+ bpf_dmabuf_reg_info.ctx_arg_info[0].btf_id = bpf_dmabuf_btf_id[0];
+ return bpf_iter_reg_target(&bpf_dmabuf_reg_info);
+}
+
+late_initcall(dmabuf_iter_init);
+
+struct bpf_iter_dmabuf {
+ /*
+ * opaque iterator state; having __u64 here allows us to preserve correct
+ * alignment requirements in vmlinux.h, generated from BTF
+ */
+ __u64 __opaque[1];
+} __aligned(8);
+
+/* Non-opaque version of bpf_iter_dmabuf */
+struct bpf_iter_dmabuf_kern {
+ struct dma_buf *dmabuf;
+} __aligned(8);
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_iter_dmabuf_new(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
+ BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
+
+ kit->dmabuf = NULL;
+ return 0;
+}
+
+__bpf_kfunc struct dma_buf *bpf_iter_dmabuf_next(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ kit->dmabuf = dma_buf_iter_next(kit->dmabuf);
+ else
+ kit->dmabuf = dma_buf_iter_begin();
+
+ return kit->dmabuf;
+}
+
+__bpf_kfunc void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it)
+{
+ struct bpf_iter_dmabuf_kern *kit = (void *)it;
+
+ if (kit->dmabuf)
+ dma_buf_put(kit->dmabuf);
+}
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 92b606d60020..71f9931ac64c 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -175,20 +175,30 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
+static inline bool is_fd_htab(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
+}
+
+static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
+{
+ return l->key + round_up(key_size, 8);
+}
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
- *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr;
+ *(void __percpu **)htab_elem_value(l, key_size) = pptr;
}
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
- return *(void __percpu **)(l->key + roundup(key_size, 8));
+ return *(void __percpu **)htab_elem_value(l, key_size);
}
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
{
- return *(void **)(l->key + roundup(map->key_size, 8));
+ return *(void **)htab_elem_value(l, map->key_size);
}
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
@@ -196,9 +206,13 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
+/* Both percpu and fd htab support in-place update, so no need for
+ * extra elem. LRU itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
static bool htab_has_extra_elems(struct bpf_htab *htab)
{
- return !htab_is_percpu(htab) && !htab_is_lru(htab);
+ return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
}
static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
@@ -215,10 +229,10 @@ static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab)
elem = get_htab_elem(htab, i);
if (btf_record_has_field(htab->map.record, BPF_TIMER))
bpf_obj_free_timer(htab->map.record,
- elem->key + round_up(htab->map.key_size, 8));
+ htab_elem_value(elem, htab->map.key_size));
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
- elem->key + round_up(htab->map.key_size, 8));
+ htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
}
@@ -245,7 +259,8 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)
cond_resched();
}
} else {
- bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ bpf_obj_free_fields(htab->map.record,
+ htab_elem_value(elem, htab->map.key_size));
cond_resched();
}
cond_resched();
@@ -453,8 +468,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
- bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
- attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
/* percpu_lru means each cpu has its own LRU list.
* it is different from BPF_MAP_TYPE_PERCPU_HASH where
* the map's value itself is percpu. percpu_lru has
@@ -549,10 +562,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (err)
goto free_map_locked;
- if (!percpu && !lru) {
- /* lru itself can remove the least used element, so
- * there is no need for an extra elem during map_update.
- */
+ if (htab_has_extra_elems(htab)) {
err = alloc_extra_elems(htab);
if (err)
goto free_prealloc;
@@ -670,7 +680,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l)
- return l->key + round_up(map->key_size, 8);
+ return htab_elem_value(l, map->key_size);
return NULL;
}
@@ -709,7 +719,7 @@ static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
if (l) {
if (mark)
bpf_lru_node_set_ref(&l->lru_node);
- return l->key + round_up(map->key_size, 8);
+ return htab_elem_value(l, map->key_size);
}
return NULL;
@@ -763,7 +773,7 @@ static void check_and_free_fields(struct bpf_htab *htab,
for_each_possible_cpu(cpu)
bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
} else {
- void *map_value = elem->key + round_up(htab->map.key_size, 8);
+ void *map_value = htab_elem_value(elem, htab->map.key_size);
bpf_obj_free_fields(htab->map.record, map_value);
}
@@ -968,8 +978,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
- return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
- BITS_PER_LONG == 64;
+ return is_fd_htab(htab) && BITS_PER_LONG == 64;
}
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
@@ -1039,11 +1048,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
htab_elem_set_ptr(l_new, key_size, pptr);
} else if (fd_htab_map_needs_adjust(htab)) {
size = round_up(size, 8);
- memcpy(l_new->key + round_up(key_size, 8), value, size);
+ memcpy(htab_elem_value(l_new, key_size), value, size);
} else {
- copy_map_value(&htab->map,
- l_new->key + round_up(key_size, 8),
- value);
+ copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
}
l_new->hash = hash;
@@ -1072,10 +1079,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new = NULL, *l_old;
+ struct htab_elem *l_new, *l_old;
struct hlist_nulls_head *head;
unsigned long flags;
- void *old_map_ptr;
struct bucket *b;
u32 key_size, hash;
int ret;
@@ -1106,7 +1112,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (l_old) {
/* grab the element lock and update value in place */
copy_map_value_locked(map,
- l_old->key + round_up(key_size, 8),
+ htab_elem_value(l_old, key_size),
value, false);
return 0;
}
@@ -1134,7 +1140,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
* and update element in place
*/
copy_map_value_locked(map,
- l_old->key + round_up(key_size, 8),
+ htab_elem_value(l_old, key_size),
value, false);
ret = 0;
goto err;
@@ -1156,24 +1162,14 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_del_rcu(&l_old->hash_node);
/* l_old has already been stashed in htab->extra_elems, free
- * its special fields before it is available for reuse. Also
- * save the old map pointer in htab of maps before unlock
- * and release it after unlock.
+ * its special fields before it is available for reuse.
*/
- old_map_ptr = NULL;
- if (htab_is_prealloc(htab)) {
- if (map->ops->map_fd_put_ptr)
- old_map_ptr = fd_htab_map_get_ptr(map, l_old);
+ if (htab_is_prealloc(htab))
check_and_free_fields(htab, l_old);
- }
}
htab_unlock_bucket(b, flags);
- if (l_old) {
- if (old_map_ptr)
- map->ops->map_fd_put_ptr(map, old_map_ptr, true);
- if (!htab_is_prealloc(htab))
- free_htab_elem(htab, l_old);
- }
+ if (l_old && !htab_is_prealloc(htab))
+ free_htab_elem(htab, l_old);
return 0;
err:
htab_unlock_bucket(b, flags);
@@ -1220,8 +1216,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
- copy_map_value(&htab->map,
- l_new->key + round_up(map->key_size, 8), value);
+ copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
ret = htab_lock_bucket(b, &flags);
if (ret)
@@ -1255,13 +1250,14 @@ err_lock_bucket:
return ret;
}
-static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
void *value, u64 map_flags,
- bool onallcpus)
+ bool percpu, bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new = NULL, *l_old;
+ struct htab_elem *l_new, *l_old;
struct hlist_nulls_head *head;
+ void *old_map_ptr = NULL;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -1292,21 +1288,29 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
goto err;
if (l_old) {
- /* per-cpu hash map can update value in-place */
- pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
- value, onallcpus);
+ /* Update value in-place */
+ if (percpu) {
+ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
+ value, onallcpus);
+ } else {
+ void **inner_map_pptr = htab_elem_value(l_old, key_size);
+
+ old_map_ptr = *inner_map_pptr;
+ WRITE_ONCE(*inner_map_pptr, *(void **)value);
+ }
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus, NULL);
+ hash, percpu, onallcpus, NULL);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
}
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
}
- ret = 0;
err:
htab_unlock_bucket(b, flags);
+ if (old_map_ptr)
+ map->ops->map_fd_put_ptr(map, old_map_ptr, true);
return ret;
}
@@ -1383,7 +1387,7 @@ err_lock_bucket:
static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags)
{
- return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+ return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
}
static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
@@ -1500,10 +1504,10 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
/* We only free timer on uref dropping to zero */
if (btf_record_has_field(htab->map.record, BPF_TIMER))
bpf_obj_free_timer(htab->map.record,
- l->key + round_up(htab->map.key_size, 8));
+ htab_elem_value(l, htab->map.key_size));
if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
bpf_obj_free_workqueue(htab->map.record,
- l->key + round_up(htab->map.key_size, 8));
+ htab_elem_value(l, htab->map.key_size));
}
cond_resched_rcu();
}
@@ -1615,15 +1619,12 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
off += roundup_value_size;
}
} else {
- u32 roundup_key_size = round_up(map->key_size, 8);
+ void *src = htab_elem_value(l, map->key_size);
if (flags & BPF_F_LOCK)
- copy_map_value_locked(map, value, l->key +
- roundup_key_size,
- true);
+ copy_map_value_locked(map, value, src, true);
else
- copy_map_value(map, value, l->key +
- roundup_key_size);
+ copy_map_value(map, value, src);
/* Zeroing special fields in the temp buffer */
check_and_init_map_value(map, value);
}
@@ -1680,12 +1681,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
bool is_percpu)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
void __user *uvalues = u64_to_user_ptr(attr->batch.values);
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size, map_id;
+ u32 bucket_cnt, total, key_size, value_size;
struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
@@ -1720,7 +1721,6 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
return -ENOENT;
key_size = htab->map.key_size;
- roundup_key_size = round_up(htab->map.key_size, 8);
value_size = htab->map.value_size;
size = round_up(value_size, 8);
if (is_percpu)
@@ -1812,8 +1812,8 @@ again_nocopy:
off += size;
}
} else {
- value = l->key + roundup_key_size;
- if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+ value = htab_elem_value(l, key_size);
+ if (is_fd_htab(htab)) {
struct bpf_map **inner_map = value;
/* Actual value is the id of the inner map */
@@ -2063,11 +2063,11 @@ static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
{
struct bpf_iter_seq_hash_map_info *info = seq->private;
- u32 roundup_key_size, roundup_value_size;
struct bpf_iter__bpf_map_elem ctx = {};
struct bpf_map *map = info->map;
struct bpf_iter_meta meta;
int ret = 0, off = 0, cpu;
+ u32 roundup_value_size;
struct bpf_prog *prog;
void __percpu *pptr;
@@ -2077,10 +2077,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
ctx.meta = &meta;
ctx.map = info->map;
if (elem) {
- roundup_key_size = round_up(map->key_size, 8);
ctx.key = elem->key;
if (!info->percpu_value_buf) {
- ctx.value = elem->key + roundup_key_size;
+ ctx.value = htab_elem_value(elem, map->key_size);
} else {
roundup_value_size = round_up(map->value_size, 8);
pptr = htab_elem_get_ptr(elem, map->key_size);
@@ -2165,7 +2164,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
struct htab_elem *elem;
- u32 roundup_key_size;
int i, num_elems = 0;
void __percpu *pptr;
struct bucket *b;
@@ -2180,7 +2178,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
is_percpu = htab_is_percpu(htab);
- roundup_key_size = round_up(map->key_size, 8);
/* migration has been disabled, so percpu value prepared here will be
* the same as the one seen by the bpf program with
* bpf_map_lookup_elem().
@@ -2196,7 +2193,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
pptr = htab_elem_get_ptr(elem, map->key_size);
val = this_cpu_ptr(pptr);
} else {
- val = elem->key + roundup_key_size;
+ val = htab_elem_value(elem, map->key_size);
}
num_elems++;
ret = callback_fn((u64)(long)map, (u64)(long)key,
@@ -2411,8 +2408,8 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
ret = __htab_lru_percpu_map_update_elem(map, key, value,
map_flags, true);
else
- ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
- true);
+ ret = htab_map_update_elem_in_place(map, key, value, map_flags,
+ true, true);
rcu_read_unlock();
return ret;
@@ -2536,24 +2533,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
return ret;
}
-/* only called from syscall */
+/* Only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
void *ptr;
int ret;
- u32 ufd = *(u32 *)value;
- ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
+ ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
/* The htab bucket lock is always held during update operations in fd
* htab map, and the following rcu_read_lock() is only used to avoid
- * the WARN_ON_ONCE in htab_map_update_elem().
+ * the WARN_ON_ONCE in htab_map_update_elem_in_place().
*/
rcu_read_lock();
- ret = htab_map_update_elem(map, key, &ptr, map_flags);
+ ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
rcu_read_unlock();
if (ret)
map->ops->map_fd_put_ptr(map, ptr, false);
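Not part of the patch: the fd-htab changes above make hash-of-maps updates in-place, with the old inner map pointer released only after the bucket lock is dropped. The user-facing flow is unchanged; a minimal user-space sketch of replacing an inner map in an existing slot, assuming outer_fd is a BPF_MAP_TYPE_HASH_OF_MAPS map and new_inner_fd is a compatible inner map:

#include <bpf/bpf.h>

/* Sketch: overwrite the inner map stored under 'key'; with this patch the
 * element is updated in place and the previous inner map is put afterwards.
 */
static int replace_inner_map(int outer_fd, __u32 key, int new_inner_fd)
{
        return bpf_map_update_elem(outer_fd, &key, &new_inner_fd, BPF_ANY);
}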
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e3a2662f4e33..6b4877e85a68 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -23,6 +23,8 @@
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/kasan.h>
+#include <linux/bpf_verifier.h>
+#include <linux/uaccess.h>
#include "../../lib/kstrtox.h"
@@ -129,7 +131,8 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = {
BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, map, void *, key, u32, cpu)
{
- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
return (unsigned long) map->ops->map_lookup_percpu_elem(map, key, cpu);
}
@@ -761,22 +764,13 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
return -EINVAL;
}
-/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
- * arguments representation.
- */
-#define MAX_BPRINTF_BIN_ARGS 512
-
/* Support executing three nested bprintf helper calls on a given CPU */
#define MAX_BPRINTF_NEST_LEVEL 3
-struct bpf_bprintf_buffers {
- char bin_args[MAX_BPRINTF_BIN_ARGS];
- char buf[MAX_BPRINTF_BUF];
-};
static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
-static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
+int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
{
int nest_level;
@@ -792,16 +786,21 @@ static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
return 0;
}
-void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
+void bpf_put_buffers(void)
{
- if (!data->bin_args && !data->buf)
- return;
if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
return;
this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable();
}
+void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
+{
+ if (!data->bin_args && !data->buf)
+ return;
+ bpf_put_buffers();
+}
+
/*
* bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
*
@@ -816,7 +815,7 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
* In argument preparation mode, if 0 is returned, safe temporary buffers are
* allocated and bpf_bprintf_cleanup should be called to free them after use.
*/
-int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
u32 num_args, struct bpf_bprintf_data *data)
{
bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
@@ -832,7 +831,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
return -EINVAL;
fmt_size = fmt_end - fmt;
- if (get_buffers && try_get_buffers(&buffers))
+ if (get_buffers && bpf_try_get_buffers(&buffers))
return -EBUSY;
if (data->get_bin_args) {
@@ -882,6 +881,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
if (fmt[i] == 'p') {
sizeof_cur_arg = sizeof(long);
+ if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+ ispunct(fmt[i + 1])) {
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+ goto nocopy_fmt;
+ }
+
if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
fmt[i + 2] == 's') {
fmt_ptype = fmt[i + 1];
@@ -889,11 +895,9 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
goto fmt_str;
}
- if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
- ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
+ if (fmt[i + 1] == 'K' ||
fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
fmt[i + 1] == 'S') {
- /* just kernel pointers */
if (tmp_buf)
cur_arg = raw_args[num_spec];
i++;
@@ -1713,16 +1717,6 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
memset(ptr, 0, sizeof(*ptr));
}
-static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
-{
- u32 size = __bpf_dynptr_size(ptr);
-
- if (len > size || offset > size - len)
- return -E2BIG;
-
- return 0;
-}
-
BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
{
int err;
@@ -1809,8 +1803,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.arg5_type = ARG_ANYTHING,
};
-static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
- u32 len, u64 flags)
+int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
+ u32 len, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1912,6 +1906,12 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_proto __weak;
const struct bpf_func_proto bpf_probe_read_kernel_str_proto __weak;
const struct bpf_func_proto bpf_task_pt_regs_proto __weak;
+const struct bpf_func_proto bpf_perf_event_read_proto __weak;
+const struct bpf_func_proto bpf_send_signal_proto __weak;
+const struct bpf_func_proto bpf_send_signal_thread_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto __weak;
+const struct bpf_func_proto bpf_get_task_stack_proto __weak;
+const struct bpf_func_proto bpf_get_branch_snapshot_proto __weak;
const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
@@ -1965,6 +1965,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_ns_current_pid_tgid:
return &bpf_get_ns_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
default:
break;
}
@@ -2022,7 +2024,21 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_get_current_ancestor_cgroup_id:
return &bpf_get_current_ancestor_cgroup_id_proto;
+ case BPF_FUNC_current_task_under_cgroup:
+ return &bpf_current_task_under_cgroup_proto;
#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
+ case BPF_FUNC_task_storage_get:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_get_recur_proto;
+ return &bpf_task_storage_get_proto;
+ case BPF_FUNC_task_storage_delete:
+ if (bpf_prog_check_recur(prog))
+ return &bpf_task_storage_delete_recur_proto;
+ return &bpf_task_storage_delete_proto;
default:
break;
}
@@ -2037,6 +2053,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_task_btf:
return &bpf_get_current_task_btf_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
@@ -2047,6 +2065,10 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_probe_read_kernel_str:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_kernel_str_proto;
+ case BPF_FUNC_copy_from_user:
+ return &bpf_copy_from_user_proto;
+ case BPF_FUNC_copy_from_user_task:
+ return &bpf_copy_from_user_task_proto;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_snprintf:
@@ -2057,6 +2079,19 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return bpf_get_trace_vprintk_proto();
case BPF_FUNC_perf_event_read_value:
return bpf_get_perf_event_read_value_proto();
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
+ case BPF_FUNC_send_signal:
+ return &bpf_send_signal_proto;
+ case BPF_FUNC_send_signal_thread:
+ return &bpf_send_signal_thread_proto;
+ case BPF_FUNC_get_task_stack:
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
+ case BPF_FUNC_get_branch_snapshot:
+ return &bpf_get_branch_snapshot_proto;
+ case BPF_FUNC_find_vma:
+ return &bpf_find_vma_proto;
default:
return NULL;
}
@@ -2293,6 +2328,26 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
return __bpf_list_del(head, true);
}
+__bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->next;
+}
+
+__bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head)
+{
+ struct list_head *h = (struct list_head *)head;
+
+ if (list_empty(h) || unlikely(!h->next))
+ return NULL;
+
+ return (struct bpf_list_node *)h->prev;
+}
+
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
struct bpf_rb_node *node)
{
@@ -2366,6 +2421,33 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
return (struct bpf_rb_node *)rb_first_cached(r);
}
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_root(struct bpf_rb_root *root)
+{
+ struct rb_root_cached *r = (struct rb_root_cached *)root;
+
+ return (struct bpf_rb_node *)r->rb_root.rb_node;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_left(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_left;
+}
+
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_right(struct bpf_rb_root *root, struct bpf_rb_node *node)
+{
+ struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
+
+ if (READ_ONCE(node_internal->owner) != root)
+ return NULL;
+
+ return (struct bpf_rb_node *)node_internal->rb_node.rb_right;
+}
+
/**
* bpf_task_acquire - Acquire a reference to a task. A task acquired by this
* kfunc which is not stored in a map as a kptr, must be released by calling
@@ -2826,6 +2908,52 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
return 0;
}
+/**
+ * bpf_dynptr_memset() - Fill dynptr memory with a constant byte.
+ * @p: Destination dynptr - where data will be filled
+ * @offset: Offset into the dynptr to start filling from
+ * @size: Number of bytes to fill
+ * @val: Constant byte to fill the memory with
+ *
+ * Fills the @size bytes of the memory area pointed to by @p
+ * at @offset with the constant byte @val.
+ * Returns 0 on success; negative error, otherwise.
+ */
+__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+ u32 chunk_sz, write_off;
+ char buf[256];
+ void *slice;
+ int err;
+
+ slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size);
+ if (likely(slice)) {
+ memset(slice, val, size);
+ return 0;
+ }
+
+ if (__bpf_dynptr_is_rdonly(ptr))
+ return -EINVAL;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, size);
+ if (err)
+ return err;
+
+ /* Non-linear data under the dynptr, write from a local buffer */
+ chunk_sz = min_t(u32, sizeof(buf), size);
+ memset(buf, val, chunk_sz);
+
+ for (write_off = 0; write_off < size; write_off += chunk_sz) {
+ chunk_sz = min_t(u32, sizeof(buf), size - write_off);
+ err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
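Not part of the patch: a BPF-side sketch of the new kfunc, zero-filling a ringbuf-backed dynptr. The __ksym declaration mirrors the definition above; the map, section name and sizes are assumptions for illustration.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size,
                             __u8 val) __weak __ksym;

struct {
        __uint(type, BPF_MAP_TYPE_RINGBUF);
        __uint(max_entries, 4096);
} rb SEC(".maps");

SEC("tp/syscalls/sys_enter_getpid")
int zero_fill(void *ctx)
{
        struct bpf_dynptr dptr;

        if (bpf_ringbuf_reserve_dynptr(&rb, 64, 0, &dptr)) {
                /* dynptr must still be released on the failure path */
                bpf_ringbuf_discard_dynptr(&dptr, 0);
                return 0;
        }
        bpf_dynptr_memset(&dptr, 0, 64, 0);     /* zero the reserved record */
        bpf_ringbuf_submit_dynptr(&dptr, 0);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";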
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
return obj;
@@ -2858,9 +2986,16 @@ static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp)
struct bpf_throw_ctx *ctx = cookie;
struct bpf_prog *prog;
- if (!is_bpf_text_address(ip))
- return !ctx->cnt;
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it has an
+ * active stack frame on the current stack trace, and won't disappear.
+ */
+ rcu_read_lock();
prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (!prog)
+ return !ctx->cnt;
ctx->cnt++;
if (bpf_is_subprog(prog))
return true;
@@ -2923,9 +3058,9 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
int (callback_fn)(void *map, int *key, void *value),
unsigned int flags,
- void *aux__ign)
+ void *aux__prog)
{
- struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign;
+ struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__prog;
struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
if (flags)
@@ -3194,6 +3329,380 @@ __bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
local_irq_restore(*flags__irq_flag);
}
+__bpf_kfunc void __bpf_trap(void)
+{
+}
+
+/*
+ * Kfuncs for string operations.
+ *
+ * Since strings are not necessarily %NUL-terminated, we cannot directly call
+ * in-kernel implementations. Instead, we open-code the implementations using
+ * __get_kernel_nofault instead of plain dereference to make them safe.
+ */
+
+/**
+ * bpf_strcmp - Compare two strings
+ * @s1__ign: One string
+ * @s2__ign: Another string
+ *
+ * Return:
+ * * %0 - Strings are equal
+ * * %-1 - @s1__ign is smaller
+ * * %1 - @s2__ign is smaller
+ * * %-EFAULT - Cannot read one of the strings
+ * %-E2BIG - One of the strings is too large
+ * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign)
+{
+ char c1, c2;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
+ !copy_from_kernel_nofault_allowed(s2__ign, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&c1, s1__ign, char, err_out);
+ __get_kernel_nofault(&c2, s2__ign, char, err_out);
+ if (c1 != c2)
+ return c1 < c2 ? -1 : 1;
+ if (c1 == '\0')
+ return 0;
+ s1__ign++;
+ s2__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strnchr - Find a character in a length limited string
+ * @s__ign: The string to be searched
+ * @count: The number of characters to be searched
+ * @c: The character to search for
+ *
+ * Note that the %NUL-terminator is considered part of the string, and can
+ * be searched for.
+ *
+ * Return:
+ * * >=0 - Index of the first occurrence of @c within @s__ign
+ * * %-ENOENT - @c not found in the first @count characters of @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c)
+{
+ char sc;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&sc, s__ign, char, err_out);
+ if (sc == c)
+ return i;
+ if (sc == '\0')
+ return -ENOENT;
+ s__ign++;
+ }
+ return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strchr - Find the first occurrence of a character in a string
+ * @s__ign: The string to be searched
+ * @c: The character to search for
+ *
+ * Note that the %NUL-terminator is considered part of the string, and can
+ * be searched for.
+ *
+ * Return:
+ * * >=0 - The index of the first occurrence of @c within @s__ign
+ * * %-ENOENT - @c not found in @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strchr(const char *s__ign, char c)
+{
+ return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c);
+}
+
+/**
+ * bpf_strchrnul - Find and return a character in a string, or end of string
+ * @s__ign: The string to be searched
+ * @c: The character to search for
+ *
+ * Return:
+ * * >=0 - Index of the first occurrence of @c within @s__ign or index of
+ * the null byte at the end of @s__ign when @c is not found
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strchrnul(const char *s__ign, char c)
+{
+ char sc;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&sc, s__ign, char, err_out);
+ if (sc == '\0' || sc == c)
+ return i;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strrchr - Find the last occurrence of a character in a string
+ * @s__ign: The string to be searched
+ * @c: The character to search for
+ *
+ * Return:
+ * * >=0 - Index of the last occurrence of @c within @s__ign
+ * * %-ENOENT - @c not found in @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strrchr(const char *s__ign, int c)
+{
+ char sc;
+ int i, last = -ENOENT;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&sc, s__ign, char, err_out);
+ if (sc == c)
+ last = i;
+ if (sc == '\0')
+ return last;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strnlen - Calculate the length of a length-limited string
+ * @s__ign: The string
+ * @count: The maximum number of characters to count
+ *
+ * Return:
+ * * >=0 - The length of @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count)
+{
+ char c;
+ int i;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1))
+ return -ERANGE;
+
+ guard(pagefault)();
+ for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&c, s__ign, char, err_out);
+ if (c == '\0')
+ return i;
+ s__ign++;
+ }
+ return i == XATTR_SIZE_MAX ? -E2BIG : i;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strlen - Calculate the length of a string
+ * @s__ign: The string
+ *
+ * Return:
+ * * >=0 - The length of @s__ign
+ * * %-EFAULT - Cannot read @s__ign
+ * * %-E2BIG - @s__ign is too large
+ * * %-ERANGE - @s__ign is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strlen(const char *s__ign)
+{
+ return bpf_strnlen(s__ign, XATTR_SIZE_MAX);
+}
+
+/**
+ * bpf_strspn - Calculate the length of the initial substring of @s__ign which
+ * only contains letters in @accept__ign
+ * @s__ign: The string to be searched
+ * @accept__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - The length of the initial substring of @s__ign which only
+ * contains letters from @accept__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign)
+{
+ char cs, ca;
+ int i, j;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
+ !copy_from_kernel_nofault_allowed(accept__ign, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&cs, s__ign, char, err_out);
+ if (cs == '\0')
+ return i;
+ for (j = 0; j < XATTR_SIZE_MAX; j++) {
+ __get_kernel_nofault(&ca, accept__ign + j, char, err_out);
+ if (cs == ca || ca == '\0')
+ break;
+ }
+ if (j == XATTR_SIZE_MAX)
+ return -E2BIG;
+ if (ca == '\0')
+ return i;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strcspn - Calculate the length of the initial substring of @s__ign which
+ * does not contain letters in @reject__ign
+ * @s__ign: The string to be searched
+ * @reject__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - The length of the initial substring of @s__ign which does not
+ * contain letters from @reject__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign)
+{
+ char cs, cr;
+ int i, j;
+
+ if (!copy_from_kernel_nofault_allowed(s__ign, 1) ||
+ !copy_from_kernel_nofault_allowed(reject__ign, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ __get_kernel_nofault(&cs, s__ign, char, err_out);
+ if (cs == '\0')
+ return i;
+ for (j = 0; j < XATTR_SIZE_MAX; j++) {
+ __get_kernel_nofault(&cr, reject__ign + j, char, err_out);
+ if (cs == cr || cr == '\0')
+ break;
+ }
+ if (j == XATTR_SIZE_MAX)
+ return -E2BIG;
+ if (cr != '\0')
+ return i;
+ s__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strnstr - Find the first substring in a length-limited string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ * @len: The maximum number of characters to search
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within the first @len characters of @s1__ign
+ * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len)
+{
+ char c1, c2;
+ int i, j;
+
+ if (!copy_from_kernel_nofault_allowed(s1__ign, 1) ||
+ !copy_from_kernel_nofault_allowed(s2__ign, 1)) {
+ return -ERANGE;
+ }
+
+ guard(pagefault)();
+ for (i = 0; i < XATTR_SIZE_MAX; i++) {
+ for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) {
+ __get_kernel_nofault(&c2, s2__ign + j, char, err_out);
+ if (c2 == '\0')
+ return i;
+ __get_kernel_nofault(&c1, s1__ign + j, char, err_out);
+ if (c1 == '\0')
+ return -ENOENT;
+ if (c1 != c2)
+ break;
+ }
+ if (j == XATTR_SIZE_MAX)
+ return -E2BIG;
+ if (i + j == len)
+ return -ENOENT;
+ s1__ign++;
+ }
+ return -E2BIG;
+err_out:
+ return -EFAULT;
+}
+
+/**
+ * bpf_strstr - Find the first substring in a string
+ * @s1__ign: The string to be searched
+ * @s2__ign: The string to search for
+ *
+ * Return:
+ * * >=0 - Index of the first character of the first occurrence of @s2__ign
+ * within @s1__ign
+ * * %-ENOENT - @s2__ign is not a substring of @s1__ign
+ * * %-EFAULT - Cannot read one of the strings
+ * * %-E2BIG - One of the strings is too large
+ * * %-ERANGE - One of the strings is outside of kernel address space
+ */
+__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
+{
+ return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX);
+}
+
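Not part of the patch: a BPF-side sketch exercising the string kfuncs above on a kernel string. The __ksym declarations mirror the definitions; the sched_switch attach point and the "kworker" match are just examples of a kernel pointer and a .rodata string these kfuncs can safely probe.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern int bpf_strlen(const char *s__ign) __weak __ksym;
extern int bpf_strstr(const char *s1__ign, const char *s2__ign) __weak __ksym;

SEC("tp_btf/sched_switch")
int BPF_PROG(on_switch, bool preempt, struct task_struct *prev,
             struct task_struct *next)
{
        /* Both kfuncs read memory via __get_kernel_nofault internally, so a
         * bad pointer yields -EFAULT/-ERANGE instead of faulting.
         */
        int len = bpf_strlen(next->comm);
        int idx = bpf_strstr(next->comm, "kworker");

        if (len > 0 && idx >= 0)
                bpf_printk("kworker task, comm len %d", len);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";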
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@@ -3209,11 +3718,16 @@ BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
#ifdef CONFIG_CGROUPS
BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
@@ -3275,6 +3789,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
BTF_ID_FLAGS(func, bpf_dynptr_size)
BTF_ID_FLAGS(func, bpf_dynptr_clone)
BTF_ID_FLAGS(func, bpf_dynptr_copy)
+BTF_ID_FLAGS(func, bpf_dynptr_memset)
#ifdef CONFIG_NET
BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
#endif
@@ -3294,6 +3809,35 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_local_irq_save)
BTF_ID_FLAGS(func, bpf_local_irq_restore)
+BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
+BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
+BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+#ifdef CONFIG_DMA_SHARED_BUFFER
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_new, KF_ITER_NEW | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+#endif
+BTF_ID_FLAGS(func, __bpf_trap)
+BTF_ID_FLAGS(func, bpf_strcmp);
+BTF_ID_FLAGS(func, bpf_strchr);
+BTF_ID_FLAGS(func, bpf_strchrnul);
+BTF_ID_FLAGS(func, bpf_strnchr);
+BTF_ID_FLAGS(func, bpf_strrchr);
+BTF_ID_FLAGS(func, bpf_strlen);
+BTF_ID_FLAGS(func, bpf_strnlen);
+BTF_ID_FLAGS(func, bpf_strspn);
+BTF_ID_FLAGS(func, bpf_strcspn);
+BTF_ID_FLAGS(func, bpf_strstr);
+BTF_ID_FLAGS(func, bpf_strnstr);
+#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
+BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
+#endif
+BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c
index fec8005a121c..8158e9c1af7b 100644
--- a/kernel/bpf/link_iter.c
+++ b/kernel/bpf/link_iter.c
@@ -78,8 +78,7 @@ static const struct seq_operations bpf_link_seq_ops = {
.show = bpf_link_seq_show,
};
-BTF_ID_LIST(btf_bpf_link_id)
-BTF_ID(struct, bpf_link)
+BTF_ID_LIST_SINGLE(btf_bpf_link_id, struct, bpf_link)
static const struct bpf_iter_seq_info bpf_link_seq_info = {
.seq_ops = &bpf_link_seq_ops,
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 3969eb0382af..632d51b05fe9 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -394,17 +394,10 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
return -EINVAL;
} else {
- u32 int_data;
-
/*
* Key is expected to be u64, which stores the cgroup_inode_id
*/
-
- if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
- return -EINVAL;
-
- int_data = *(u32 *)(key_type + 1);
- if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data))
+ if (!btf_type_is_i64(key_type))
return -EINVAL;
}
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 868cc2c43899..8e88201c98bf 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -11,8 +11,6 @@
struct bpf_netns_link {
struct bpf_link link;
- enum bpf_attach_type type;
- enum netns_bpf_attach_type netns_type;
/* We don't hold a ref to net in order to auto-detach the link
* when netns is going away. Instead we rely on pernet
@@ -21,6 +19,7 @@ struct bpf_netns_link {
*/
struct net *net;
struct list_head node; /* node in list of links attached to net */
+ enum netns_bpf_attach_type netns_type;
};
/* Protects updates to netns_bpf */
@@ -216,7 +215,7 @@ static int bpf_netns_link_fill_info(const struct bpf_link *link,
mutex_unlock(&netns_bpf_mutex);
info->netns.netns_ino = inum;
- info->netns.attach_type = net_link->type;
+ info->netns.attach_type = link->attach_type;
return 0;
}
@@ -230,7 +229,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link,
"netns_ino:\t%u\n"
"attach_type:\t%u\n",
info.netns.netns_ino,
- info.netns.attach_type);
+ link->attach_type);
}
static const struct bpf_link_ops bpf_netns_link_ops = {
@@ -501,9 +500,8 @@ int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
goto out_put_net;
}
bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS,
- &bpf_netns_link_ops, prog);
+ &bpf_netns_link_ops, prog, type);
net_link->net = net;
- net_link->type = type;
net_link->netns_type = netns_type;
err = bpf_link_prime(&net_link->link, &link_primer);
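The same conversion is applied to each link type in this series: the attach type now lives in the generic struct bpf_link, set once by bpf_link_init(), so private copies and their fill_info/show_fdinfo plumbing go away. A hedged sketch of the resulting pattern for some other, hypothetical link type (BPF_LINK_TYPE_FOO, my_link, my_link_ops and the info field are placeholders):

/* Sketch only: the attach type is passed to bpf_link_init() ... */
bpf_link_init(&my_link->link, BPF_LINK_TYPE_FOO, &my_link_ops, prog,
	      attr->link_create.attach_type);

/* ... and read back from the generic link, e.g. in the fill_info callback: */
info->foo.attach_type = link->attach_type;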
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index c9d45c9d6918..aef7b0bc96d6 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -1,8 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-config USERMODE_DRIVER
- bool
- default n
-
menuconfig BPF_PRELOAD
bool "Preload BPF file system with kernel specific program and map iterators"
depends on BPF
@@ -10,7 +6,6 @@ menuconfig BPF_PRELOAD
# The dependency on !COMPILE_TEST prevents it from being enabled
# in allmodconfig or allyesconfig configurations
depends on !COMPILE_TEST
- select USERMODE_DRIVER
help
This builds a kernel module with several embedded BPF programs that are
pinned into the BPF FS mount point as human readable files that are
diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
index ebdc6c0cdb70..49b1d515a847 100644
--- a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
+++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h
@@ -89,10 +89,7 @@ iterators_bpf__load(struct iterators_bpf *skel)
{
struct bpf_load_and_run_opts opts = {};
int err;
-
- opts.ctx = (struct bpf_loader_ctx *)skel;
- opts.data_sz = 6008;
- opts.data = (void *)"\
+ static const char opts_data[] __attribute__((__aligned__(8))) = "\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
@@ -126,190 +123,196 @@ iterators_bpf__load(struct iterators_bpf *skel)
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\
-\0\0\0\x18\0\0\0\0\0\0\x04\x1c\0\0\x04\x1c\0\0\x05\x18\0\0\0\0\x02\0\0\0\0\0\0\
+\0\0\0\x18\0\0\0\0\0\0\x04\x80\0\0\x04\x80\0\0\x05\x44\0\0\0\0\x02\0\0\0\0\0\0\
\x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\
\0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\
\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\
-\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc2\x04\0\0\x03\0\0\0\x18\0\0\0\
-\xd0\0\0\0\x09\0\0\0\0\0\0\0\xd4\0\0\0\x0b\0\0\0\x40\0\0\0\xdf\0\0\0\x0b\0\0\0\
-\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe7\x07\0\0\0\0\0\0\0\0\0\0\xf0\x08\0\0\
-\0\0\0\0\x0c\0\0\0\xf6\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xb3\x04\0\0\x03\0\
-\0\0\x18\0\0\x01\xbb\0\0\0\x0e\0\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x01\
-\xc3\0\0\0\x0e\0\0\0\xa0\0\0\x01\xcf\x08\0\0\0\0\0\0\x0f\0\0\x01\xd5\x01\0\0\0\
-\0\0\0\x04\0\0\0\x20\0\0\x01\xe2\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\
-\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xe7\x01\0\0\0\0\0\0\x04\0\0\
-\0\x20\0\0\0\0\x02\0\0\0\0\0\0\x14\0\0\x02\x4b\x04\0\0\x02\0\0\0\x10\0\0\0\x13\
-\0\0\0\x03\0\0\0\0\0\0\x02\x5e\0\0\0\x15\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x18\
-\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x13\0\0\x02\x63\x0c\0\0\x01\0\0\
-\0\x16\0\0\x02\xaf\x04\0\0\x01\0\0\0\x08\0\0\x02\xb8\0\0\0\x19\0\0\0\0\0\0\0\0\
-\x02\0\0\0\0\0\0\x1a\0\0\x03\x09\x04\0\0\x06\0\0\0\x38\0\0\x01\xbb\0\0\0\x0e\0\
-\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x03\x16\0\0\0\x1b\0\0\0\xc0\0\0\x03\
-\x27\0\0\0\x15\0\0\x01\0\0\0\x03\x30\0\0\0\x1d\0\0\x01\x40\0\0\x03\x3a\0\0\0\
-\x1e\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x1c\0\0\0\0\x0a\0\0\0\0\0\0\x10\0\0\0\
-\0\x02\0\0\0\0\0\0\x1f\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\x03\x84\x04\0\0\x02\0\0\
-\0\x08\0\0\x03\x92\0\0\0\x0e\0\0\0\0\0\0\x03\x9b\0\0\0\x0e\0\0\0\x20\0\0\x03\
-\x3a\x04\0\0\x03\0\0\0\x18\0\0\x03\xa5\0\0\0\x1b\0\0\0\0\0\0\x03\xad\0\0\0\x21\
-\0\0\0\x40\0\0\x03\xb3\0\0\0\x23\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x22\0\0\0\0\
-\x02\0\0\0\0\0\0\x24\0\0\x03\xb7\x04\0\0\x01\0\0\0\x04\0\0\x03\xc2\0\0\0\x0e\0\
-\0\0\0\0\0\x04\x2b\x04\0\0\x01\0\0\0\x04\0\0\x04\x34\0\0\0\x0e\0\0\0\0\0\0\0\0\
-\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\x04\xaa\x0e\0\0\0\0\0\0\
-\x25\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\x04\
-\xbe\x0e\0\0\0\0\0\0\x27\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\
-\0\0\0\x20\0\0\x04\xd4\x0e\0\0\0\0\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\
-\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\x04\xe9\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\
-\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\0\x0e\0\0\0\0\0\0\x2d\
-\0\0\0\x01\0\0\x05\x08\x0f\0\0\x04\0\0\0\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\
-\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\
-\0\x11\0\0\x05\x10\x0f\0\0\x01\0\0\0\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\x62\x70\
-\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\
-\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\
-\x30\0\x2f\x68\x6f\x6d\x65\x2f\x69\x69\x69\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\
-\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\
-\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\
-\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\
-\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\
-\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\
-\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\
-\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\
-\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\
-\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\
-\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\
-\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\
-\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\
-\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\
-\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\
-\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\
-\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\
-\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
-\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\
-\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\
-\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\
-\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\
-\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\
-\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\
-\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\
-\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
-\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\
-\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\
-\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\
-\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\
-\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\
-\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
-\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\
-\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\
-\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\
-\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\
-\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\
-\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\
-\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\
-\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\
-\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\
-\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\
-\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\
-\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\
-\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\
-\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\
-\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\
-\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\
-\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\
-\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\
-\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\
-\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\
-\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\
-\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
-\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\
-\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\
-\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\
-\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\
-\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x09\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x62\0\0\0\
-\x01\0\0\0\x80\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\
-\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\0\0\0\0\0\x20\
-\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
-\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\
-\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\
+\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc3\x04\0\0\x03\0\0\0\x18\0\0\0\
+\xd1\0\0\0\x09\0\0\0\0\0\0\0\xd5\0\0\0\x0b\0\0\0\x40\0\0\0\xe0\0\0\0\x0b\0\0\0\
+\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe8\x07\0\0\0\0\0\0\0\0\0\0\xf1\x08\0\0\
+\0\0\0\0\x0c\0\0\0\xf7\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xc1\x04\0\0\x03\0\
+\0\0\x18\0\0\x01\xc9\0\0\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x01\
+\xd1\0\0\0\x0e\0\0\0\xa0\0\0\x01\xdd\x08\0\0\0\0\0\0\x0f\0\0\x01\xe3\x01\0\0\0\
+\0\0\0\x04\0\0\0\x20\0\0\x01\xf0\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\
+\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xf5\x01\0\0\0\0\0\0\x04\0\0\
+\0\x20\0\0\0\0\x0d\0\0\x01\0\0\0\x14\0\0\x05\x39\0\0\0\x04\0\0\x02\x3e\x08\0\0\
+\0\0\0\0\x15\0\0\x02\x44\x01\0\0\0\0\0\0\x08\x01\0\0\x40\0\0\x02\x4e\x0c\0\0\
+\x01\0\0\0\x13\0\0\0\0\x02\0\0\0\0\0\0\x18\0\0\x02\x65\x04\0\0\x02\0\0\0\x10\0\
+\0\0\x13\0\0\0\x03\0\0\0\0\0\0\x02\x78\0\0\0\x19\0\0\0\x40\0\0\0\0\x02\0\0\0\0\
+\0\0\x1c\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x17\0\0\x02\x7d\x0c\0\0\
+\x01\0\0\0\x1a\0\0\x02\xc9\x04\0\0\x01\0\0\0\x08\0\0\x02\xd2\0\0\0\x1d\0\0\0\0\
+\0\0\0\0\x02\0\0\0\0\0\0\x1e\0\0\x03\x23\x04\0\0\x06\0\0\0\x38\0\0\x01\xc9\0\0\
+\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x03\x30\0\0\0\x1f\0\0\0\xc0\
+\0\0\x03\x41\0\0\0\x19\0\0\x01\0\0\0\x03\x4a\0\0\0\x21\0\0\x01\x40\0\0\x03\x54\
+\0\0\0\x22\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\0\0\x0a\0\0\0\0\0\0\x10\
+\0\0\0\0\x02\0\0\0\0\0\0\x23\0\0\0\0\x02\0\0\0\0\0\0\x24\0\0\x03\x9e\x04\0\0\
+\x02\0\0\0\x08\0\0\x03\xac\0\0\0\x0e\0\0\0\0\0\0\x03\xb5\0\0\0\x0e\0\0\0\x20\0\
+\0\x03\x54\x04\0\0\x03\0\0\0\x18\0\0\x03\xbf\0\0\0\x1f\0\0\0\0\0\0\x03\xc7\0\0\
+\0\x25\0\0\0\x40\0\0\x03\xcd\0\0\0\x27\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x26\0\
+\0\0\0\x02\0\0\0\0\0\0\x28\0\0\x03\xd1\x04\0\0\x01\0\0\0\x04\0\0\x03\xdc\0\0\0\
+\x0e\0\0\0\0\0\0\x04\x45\x04\0\0\x01\0\0\0\x04\0\0\x04\x4e\0\0\0\x0e\0\0\0\0\0\
+\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\x04\xc4\x0e\0\0\0\0\
+\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\
+\x04\xd8\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\
+\x12\0\0\0\x20\0\0\x04\xee\x0e\0\0\0\0\0\0\x2d\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\
+\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\x05\x03\x0e\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\
+\0\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\x1a\x0e\0\0\0\0\0\0\
+\x31\0\0\0\x01\0\0\x05\x22\x0f\0\0\x01\0\0\0\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\
+\0\x05\x29\x0f\0\0\x04\0\0\0\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\
+\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\
+\x05\x31\x0f\0\0\x01\0\0\0\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\x05\x39\x0e\0\0\
+\0\0\0\0\x06\0\0\0\x01\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\
+\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\
+\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x32\x2f\x69\x69\x69\
+\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\
+\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\
+\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\
+\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\
+\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\
+\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\
+\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\
+\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\
+\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\
+\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\
+\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\
+\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\
+\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\
+\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\
+\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\
+\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\
+\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\
+\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\
+\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\
+\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\
+\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\
+\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\
+\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\
+\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\
+\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\
+\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\
+\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\
+\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
+\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\
+\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\
+\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\
+\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\
+\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\
+\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\
+\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\
+\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\
+\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\
+\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\
+\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\
+\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\
+\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\
+\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\
+\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\
+\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\
+\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\
+\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\
+\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\
+\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\
+\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\
+\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\
+\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\
+\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\
+\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\
+\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\
+\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\
+\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\
+\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\
+\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\
+\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\
+\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\x73\0\x2e\
+\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\x6d\x79\
+\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x09\xdc\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\x80\0\0\0\0\
+\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\
+\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\
+\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\
+\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\
+\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\
\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\
\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\
\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\
-\0\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1a\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\
-\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe8\xbf\x16\0\0\
-\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x30\0\0\0\0\0\x23\xb7\x50\0\0\
-\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe8\0\0\0\0\xb7\
-\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xf0\0\0\
-\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\
-\0\0\xff\xff\xff\xe8\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x23\
-\xb7\x30\0\0\0\0\0\x0e\xb7\x50\0\0\0\0\0\x18\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\
-\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x3c\
-\x1e\0\0\0\x01\0\0\0\x42\0\0\0\x9a\0\x01\x3c\x24\0\0\0\x02\0\0\0\x42\0\0\x01\
-\x0d\0\x01\x44\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2e\0\x01\x4c\x06\0\0\0\x04\0\0\
-\0\x42\0\0\x01\x3d\0\x01\x40\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x62\0\x01\x58\x06\
-\0\0\0\x07\0\0\0\x42\0\0\x01\x75\0\x01\x5c\x03\0\0\0\x0e\0\0\0\x42\0\0\x01\xfb\
-\0\x01\x64\x02\0\0\0\x1e\0\0\0\x42\0\0\x02\x49\0\x01\x6c\x01\0\0\0\0\0\0\0\x02\
-\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\
-\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x70\0\0\0\x0d\
-\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x09\0\0\0\0\0\0\0\xa0\0\0\0\x0d\
-\0\0\x01\x39\0\0\0\0\0\0\0\x1a\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\
-\x6d\x61\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\
-\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\
-\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\
-\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\0\x79\
-\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\
-\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\x31\xb7\x30\0\0\0\0\0\x20\xb7\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\x7b\
-\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\0\0\
-\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\x87\
-\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\
-\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\0\
-\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\
-\xb7\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\
+\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\0\
+\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1d\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\
+\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe0\xbf\x16\0\0\
+\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb4\x30\0\0\0\0\0\x30\xb4\x50\0\0\
+\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe0\0\0\0\0\xb7\
+\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xe8\0\0\
+\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf0\0\0\0\0\xbf\x17\0\0\0\0\0\0\x85\x02\
+\0\0\0\0\0\0\x7b\xa0\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\
+\xff\xe0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x30\xb4\x30\0\0\
+\0\0\0\x1a\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x44\x1e\0\0\0\
+\x01\0\0\0\x42\0\0\0\x9b\0\x01\x44\x24\0\0\0\x02\0\0\0\x42\0\0\x01\x0e\0\x01\
+\x4c\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2f\0\x01\x54\x06\0\0\0\x04\0\0\0\x42\0\0\
+\x01\x3e\0\x01\x48\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x63\0\x01\x60\x0e\0\0\0\x08\
+\0\0\0\x42\0\0\x01\x76\0\x01\x64\x03\0\0\0\x0e\0\0\0\x42\0\0\x02\x09\0\x01\x6c\
+\x02\0\0\0\x21\0\0\0\x42\0\0\x02\x3c\0\x01\x80\x01\0\0\0\0\0\0\0\x02\0\0\0\x3e\
+\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\x01\x0a\
+\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\
+\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x0a\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\x01\
+\x3a\0\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\
+\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\
+\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\
+\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\x50\x4c\0\0\0\0\0\x79\
+\x21\0\0\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\
+\0\x79\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\
+\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x4a\xb4\x30\0\0\0\0\0\x20\xb4\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\
+\x7b\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\
+\0\0\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\
+\x87\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\
+\0\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\
+\0\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\
+\xb4\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\
\0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\
-\xb7\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\
-\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3d\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\
+\xb4\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\
+\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3e\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\
\xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\
\xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\
\xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\
-\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x51\xb7\x30\0\0\0\0\0\x11\
-\xb7\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x80\x1e\0\0\0\x01\0\0\0\
-\x42\0\0\0\x9a\0\x01\x80\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x7f\0\x01\x88\x1f\0\0\
-\0\x03\0\0\0\x42\0\0\x02\xa3\0\x01\x94\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xbc\0\
-\x01\xa0\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3d\0\x01\x84\x1d\0\0\0\x06\0\0\0\x42\
-\0\0\x01\x62\0\x01\xa4\x06\0\0\0\x08\0\0\0\x42\0\0\x02\xce\0\x01\xa8\x03\0\0\0\
-\x10\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x79\0\x01\
-\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x1b\0\0\0\x42\0\0\
-\x03\xca\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x2d\0\0\0\x1e\
-\0\0\0\x42\0\0\x04\x16\0\x01\x0c\x0d\0\0\0\x20\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\
-\x02\0\0\0\x21\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x02\0\0\0\x24\0\0\0\x42\0\0\x04\
-\x3d\0\x01\x18\x0d\0\0\0\x27\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x28\0\0\
-\0\x42\0\0\x04\x3d\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\x3d\0\x01\x18\x0d\
-\0\0\0\x2c\0\0\0\x42\0\0\x04\x6b\0\x01\x1c\x1b\0\0\0\x2d\0\0\0\x42\0\0\x04\x6b\
-\0\x01\x1c\x06\0\0\0\x2e\0\0\0\x42\0\0\x04\x8e\0\x01\x24\x0d\0\0\0\x30\0\0\0\
-\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x49\0\x01\xc0\x01\0\
-\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\
-\x10\0\0\0\x14\0\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\
-\x28\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\
-\x90\0\0\0\x1a\0\0\x01\x09\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\x03\x71\0\0\0\0\0\0\
-\0\xb0\0\0\0\x1a\0\0\x03\x75\0\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\x03\xa3\0\0\0\0\0\
-\0\0\xd8\0\0\0\x20\0\0\x01\x09\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\
-\0\x01\x18\0\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\x1a\0\0\x01\x09\0\0\0\
-\0\0\0\x01\x60\0\0\0\x20\0\0\x04\x65\0\0\0\0\0\0\x01\x88\0\0\0\x1a\0\0\x01\x39\
-\0\0\0\0\0\0\x01\x98\0\0\0\x1a\0\0\x04\xa6\0\0\0\0\0\0\x01\xa0\0\0\0\x18\0\0\0\
-\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\
-\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\
-\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\x12\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
- opts.insns_sz = 2216;
- opts.insns = (void *)"\
+\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x6a\xb4\x30\0\0\0\0\0\x11\
+\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x94\x1e\0\0\0\x01\0\0\0\
+\x42\0\0\0\x9b\0\x01\x94\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x99\0\x01\x9c\x1f\0\0\
+\0\x03\0\0\0\x42\0\0\x02\xbd\0\x01\xa8\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xd6\0\
+\x01\xb4\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3e\0\x01\x98\x1d\0\0\0\x06\0\0\0\x42\
+\0\0\x01\x63\0\x01\xb8\x0e\0\0\0\x09\0\0\0\x42\0\0\x02\xe8\0\x01\xbc\x03\0\0\0\
+\x10\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x93\0\x01\
+\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x1b\0\0\0\x42\0\0\
+\x03\xe4\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xf9\0\x01\x14\x2d\0\0\0\x1e\
+\0\0\0\x42\0\0\x04\x30\0\x01\x0c\x0d\0\0\0\x21\0\0\0\x42\0\0\x03\xf9\0\x01\x14\
+\x02\0\0\0\x24\0\0\0\x42\0\0\x04\x57\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\
+\x57\0\x01\x18\x0d\0\0\0\x2c\0\0\0\x42\0\0\x04\x85\0\x01\x1c\x1b\0\0\0\x2d\0\0\
+\0\x42\0\0\x04\x85\0\x01\x1c\x0f\0\0\0\x2e\0\0\0\x42\0\0\x04\xa8\0\x01\x24\x0d\
+\0\0\0\x30\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x3c\
+\0\x01\xd4\x01\0\0\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\
+\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\x01\x0a\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\0\
+\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\0\
+\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\x01\x0a\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\
+\x03\x8b\0\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\x03\x8f\0\0\0\0\0\0\0\xc0\0\0\0\x23\0\
+\0\x03\xbd\0\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\x01\x0a\0\0\0\0\0\0\0\xf0\0\0\0\x24\
+\0\0\0\x3e\0\0\0\0\0\0\x01\x18\0\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\
+\x1e\0\0\x01\x0a\0\0\0\0\0\0\x01\x60\0\0\0\x24\0\0\x04\x7f\0\0\0\0\0\0\x01\x88\
+\0\0\0\x1e\0\0\x01\x3a\0\0\0\0\0\0\x01\x98\0\0\0\x1e\0\0\x04\xc0\0\0\0\0\0\0\
+\x01\xa0\0\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\
+\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x16\0\0\0\x01\0\0\0\0\0\
+\0\0\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\
+\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0";
+ static const char opts_insn[] __attribute__((__aligned__(8))) = "\
\xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\
\0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\
\x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\
@@ -318,72 +321,87 @@ iterators_bpf__load(struct iterators_bpf *skel)
\0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\
\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\
\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
-\0\x0e\x68\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\
-\0\0\0\x0e\x64\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\
-\0\0\0\0\0\x0e\x58\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\
-\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
-\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\
+\0\x0e\xf8\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x0e\xf4\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0e\xe8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\
\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\
-\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xa0\x63\x10\0\0\0\
+\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x30\x63\x10\0\0\0\
\0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\
-\0\x0e\x7c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
-\0\0\x0e\x70\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\
+\0\x0f\x0c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\
+\0\0\x0f\0\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\
\x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\
\x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
-\x0e\xb8\xb7\x20\0\0\0\0\0\x62\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\
+\x0f\x48\xb7\x20\0\0\0\0\0\x7b\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\
\0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\x63\
-\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x20\x18\x16\0\0\0\0\0\0\0\
-\0\0\0\0\0\x0f\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xb8\
-\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x38\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
-\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\x63\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xc8\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x0f\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\
+\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\
\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\x63\x10\
-\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\xb7\
+\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\x63\x10\
+\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\xb7\
\x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\
-\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x50\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
-\x11\x70\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x58\x18\x16\0\
-\0\0\0\0\0\0\0\0\0\0\0\x11\x68\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
-\0\0\x10\x58\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x7b\x10\0\0\0\0\0\0\x18\
-\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\x60\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xc0\
-\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\xf0\x18\x16\0\0\0\0\0\
-\0\0\0\0\0\0\0\x11\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd8\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\
-\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x78\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\
-\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x7c\x63\x10\0\0\0\0\0\0\x79\x06\0\
-\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x80\x7b\x10\0\0\0\0\0\0\x61\
-\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xa8\x63\x10\0\0\0\0\0\
-\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xf0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\
-\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\
-\xff\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\x60\x63\x07\0\x6c\0\0\0\0\
-\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\
-\0\0\0\0\0\0\0\0\0\0\x11\x60\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\
-\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd0\x61\x10\0\0\0\0\0\0\xd5\
-\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x4a\0\0\
-\0\0\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x08\x18\x16\0\
-\0\0\0\0\0\0\0\0\0\0\0\x16\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\
-\0\0\x12\x10\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd8\x7b\x10\0\0\0\0\0\0\x18\
-\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x20\
-\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x20\x18\x16\0\0\0\0\0\
-\0\0\0\0\0\0\0\x17\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x15\
-\xb0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x50\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x48\x7b\x10\0\0\0\0\
-\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xe8\x63\x10\0\0\
-\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xec\x63\x10\
-\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xf0\x7b\
-\x10\0\0\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\
-\x18\x63\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x60\xb7\x20\0\0\0\
-\0\0\x12\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\
-\0\0\0\0\0\0\xc5\x70\xff\x13\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\
-\x63\x07\0\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\
-\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\xb7\x30\0\0\0\0\0\x8c\x85\0\
-\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x40\x61\
-\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\
-\xc5\x70\xff\x01\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\
-\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\
-\0\0\x63\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\
-\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\
-\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0";
+\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\
+\x12\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\x18\x16\0\0\
+\0\0\0\0\0\0\0\0\0\0\x12\x28\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\
+\0\x11\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x70\x7b\x10\0\0\0\0\0\0\x18\x06\
+\0\0\0\0\0\0\0\0\0\0\0\0\x11\x20\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x80\x7b\
+\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x18\x16\0\0\0\0\0\0\0\
+\0\0\0\0\0\x12\xa0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x98\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x38\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\
+\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x3c\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\
+\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x40\x7b\x10\0\0\0\0\0\0\x61\x0a\
+\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x68\x63\x10\0\0\0\0\0\0\
+\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xb0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\0\
+\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\
+\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\x63\x07\0\x6c\0\0\0\0\x77\
+\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\x18\x86\0\0\0\0\0\0\0\0\0\0\0\0\x10\
+\xb8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xc8\xb7\x20\0\0\0\0\0\x17\xb7\x30\0\0\
+\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\
+\xff\x4d\0\0\0\0\x75\x70\0\x03\0\0\0\0\x62\x80\0\x04\0\0\0\0\x6a\x80\0\x02\0\0\
+\0\0\x05\0\0\x0a\0\0\0\0\x63\x87\0\x04\0\0\0\0\xbf\x97\0\0\0\0\0\0\x77\x90\0\0\
+\0\0\0\x20\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\x63\x09\0\0\0\0\0\0\x55\x90\0\
+\x02\0\0\0\0\x6a\x80\0\x02\0\0\0\0\x05\0\0\x01\0\0\0\0\x6a\x80\0\x02\0\0\0\x40\
+\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\xb7\x30\0\0\0\0\
+\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\
+\x01\0\x61\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\
+\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x90\x61\x10\0\0\0\0\0\0\xd5\x10\
+\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x2c\0\0\0\0\
+\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\xe0\x18\x16\0\0\0\
+\0\0\0\0\0\0\0\0\0\x17\x88\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\
+\x12\xe8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x80\x7b\x10\0\0\0\0\0\0\x18\x06\0\
+\0\0\0\0\0\0\0\0\0\0\0\x14\xf0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc8\x7b\x10\
+\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\xf8\x18\x16\0\0\0\0\0\0\0\0\0\
+\0\0\0\x17\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\x58\x18\
+\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf0\x7b\x10\0\0\0\0\0\0\x61\
+\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x90\x63\x10\0\0\0\0\0\0\
+\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x94\x63\x10\0\0\0\0\
+\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x98\x7b\x10\0\0\
+\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc0\x63\
+\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x18\x08\xb7\x20\0\0\0\0\0\x12\
+\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\
+\0\0\xc5\x70\xfe\xf5\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\x63\x07\0\
+\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\
+\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\
+\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\xe8\x61\x10\0\0\0\
+\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\
+\xfe\xe3\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\x10\0\x02\
+\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\0\0\x63\
+\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\x16\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\0\0\0\0\0\
+\0\0\x95\0\0\0\0\0\0\0";
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = sizeof(opts_data) - 1;
+ opts.data = (void *)opts_data;
+ opts.insns_sz = sizeof(opts_insn) - 1;
+ opts.insns = (void *)opts_insn;
+
err = bpf_load_and_run(&opts);
if (err < 0)
return err;
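The mechanical change running through this generated loader skeleton: the data and instruction blobs become static const, 8-byte-aligned char arrays, and their sizes are taken with sizeof() instead of hard-coded byte counts. A reduced sketch of the pattern (the blob is shortened to a few bytes here):

static const char opts_data[] __attribute__((__aligned__(8))) = "\xeb\x9f\x01\0";

opts.data    = (void *)opts_data;
opts.data_sz = sizeof(opts_data) - 1;	/* sizeof() counts the literal's trailing NUL */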
diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c
index 53a73c841c13..85d8fcb56fb7 100644
--- a/kernel/bpf/prog_iter.c
+++ b/kernel/bpf/prog_iter.c
@@ -78,8 +78,7 @@ static const struct seq_operations bpf_prog_seq_ops = {
.show = bpf_prog_seq_show,
};
-BTF_ID_LIST(btf_bpf_prog_id)
-BTF_ID(struct, bpf_prog)
+BTF_ID_LIST_SINGLE(btf_bpf_prog_id, struct, bpf_prog)
static const struct bpf_iter_seq_info bpf_prog_seq_info = {
.seq_ops = &bpf_prog_seq_ops,
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
index 338305c8852c..5ab354d55d82 100644
--- a/kernel/bpf/rqspinlock.c
+++ b/kernel/bpf/rqspinlock.c
@@ -666,6 +666,27 @@ EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);
__bpf_kfunc_start_defs();
+static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ struct bpf_stream_stage ss;
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return;
+ bpf_stream_stage(ss, prog, BPF_STDERR, ({
+ bpf_stream_printk(ss, "ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : "");
+ bpf_stream_printk(ss, "Attempted lock = 0x%px\n", lock);
+ bpf_stream_printk(ss, "Total held locks = %d\n", rqh->cnt);
+ for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++)
+ bpf_stream_printk(ss, "Held lock[%2d] = 0x%px\n", i, rqh->locks[i]);
+ bpf_stream_dump_stack(ss);
+ }));
+}
+
+#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? "Timeout detected" : "AA or ABBA deadlock detected"; })
+
__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
{
int ret;
@@ -676,6 +697,7 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
preempt_disable();
ret = res_spin_lock((rqspinlock_t *)lock);
if (unlikely(ret)) {
+ bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false);
preempt_enable();
return ret;
}
@@ -698,6 +720,7 @@ __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsign
local_irq_save(flags);
ret = res_spin_lock((rqspinlock_t *)lock);
if (unlikely(ret)) {
+ bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true);
local_irq_restore(flags);
preempt_enable();
return ret;
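The helper above doubles as a template for other kernel-side diagnostics added in this series: locate the responsible BPF program via the stack, then stage and commit a message plus stack dump to its BPF_STDERR stream. A hedged sketch of reusing that pattern for some other violation; the function name and the reported object are hypothetical, while the staging macros are the ones used above:

static void report_some_violation_sketch(void *obj)
{
	struct bpf_stream_stage ss;
	struct bpf_prog *prog;

	prog = bpf_prog_find_from_stack();
	if (!prog)
		return;
	bpf_stream_stage(ss, prog, BPF_STDERR, ({
		bpf_stream_printk(ss, "ERROR: violation involving object 0x%px\n", obj);
		bpf_stream_dump_stack(ss);
	}));
}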
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
new file mode 100644
index 000000000000..ab592db4a4bf
--- /dev/null
+++ b/kernel/bpf/stream.c
@@ -0,0 +1,526 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/percpu.h>
+#include <linux/refcount.h>
+#include <linux/gfp.h>
+#include <linux/memory.h>
+#include <linux/local_lock.h>
+#include <linux/mutex.h>
+
+/*
+ * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe
+ * alloc_pages_nolock()/free_pages_nolock() primitives. We allocate a page and
+ * stash it in a local per-CPU variable, and bump allocate from the page
+ * whenever items need to be printed to a stream. Each page holds a global
+ * atomic refcount in its first 4 bytes, and then records of variable length
+ * that describe the printed messages. Once the global refcount has dropped to
+ * zero, it is a signal to free the page back to the kernel's page allocator,
+ * given all the individual records in it have been consumed.
+ *
+ * It is possible the same page is used to serve allocations across different
+ * programs, which may be consumed at different times individually, hence
+ * maintaining a reference count per-page is critical for correct lifetime
+ * tracking.
+ *
+ * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it
+ * lands.
+ */
+struct bpf_stream_page {
+ refcount_t ref;
+ u32 consumed;
+ char buf[];
+};
+
+/* Available room to add data to a refcounted page. */
+#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed))
+
+static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock);
+static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page);
+
+static bool bpf_stream_page_local_lock(unsigned long *flags)
+{
+ return local_trylock_irqsave(&stream_local_lock, *flags);
+}
+
+static void bpf_stream_page_local_unlock(unsigned long *flags)
+{
+ local_unlock_irqrestore(&stream_local_lock, *flags);
+}
+
+static void bpf_stream_page_free(struct bpf_stream_page *stream_page)
+{
+ struct page *p;
+
+ if (!stream_page)
+ return;
+ p = virt_to_page(stream_page);
+ free_pages_nolock(p, 0);
+}
+
+static void bpf_stream_page_get(struct bpf_stream_page *stream_page)
+{
+ refcount_inc(&stream_page->ref);
+}
+
+static void bpf_stream_page_put(struct bpf_stream_page *stream_page)
+{
+ if (refcount_dec_and_test(&stream_page->ref))
+ bpf_stream_page_free(stream_page);
+}
+
+static void bpf_stream_page_init(struct bpf_stream_page *stream_page)
+{
+ refcount_set(&stream_page->ref, 1);
+ stream_page->consumed = 0;
+}
+
+static struct bpf_stream_page *bpf_stream_page_replace(void)
+{
+ struct bpf_stream_page *stream_page, *old_stream_page;
+ struct page *page;
+
+ page = alloc_pages_nolock(NUMA_NO_NODE, 0);
+ if (!page)
+ return NULL;
+ stream_page = page_address(page);
+ bpf_stream_page_init(stream_page);
+
+ old_stream_page = this_cpu_read(stream_pcpu_page);
+ if (old_stream_page)
+ bpf_stream_page_put(old_stream_page);
+ this_cpu_write(stream_pcpu_page, stream_page);
+ return stream_page;
+}
+
+static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len)
+{
+ int min = offsetof(struct bpf_stream_elem, str[0]);
+ int consumed = stream_page->consumed;
+ int total = BPF_STREAM_PAGE_SZ;
+ int rem = max(0, total - consumed - min);
+
+ /* Let's give room of at least 8 bytes. */
+ WARN_ON_ONCE(rem % 8 != 0);
+ rem = rem < 8 ? 0 : rem;
+ return min(len, rem);
+}
+
+static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
+{
+ init_llist_node(&elem->node);
+ elem->total_len = len;
+ elem->consumed_len = 0;
+}
+
+static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem)
+{
+ unsigned long addr = (unsigned long)elem;
+
+ return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr);
+}
+
+static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len)
+{
+ u32 consumed = stream_page->consumed;
+
+ stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8);
+ return (struct bpf_stream_elem *)&stream_page->buf[consumed];
+}
+
+static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len)
+{
+ struct bpf_stream_elem *elem = NULL;
+ struct bpf_stream_page *page;
+ int room = 0;
+
+ page = this_cpu_read(stream_pcpu_page);
+ if (!page)
+ page = bpf_stream_page_replace();
+ if (!page)
+ return NULL;
+
+ room = bpf_stream_page_check_room(page, len);
+ if (room != len)
+ page = bpf_stream_page_replace();
+ if (!page)
+ return NULL;
+ bpf_stream_page_get(page);
+ room = bpf_stream_page_check_room(page, len);
+ WARN_ON_ONCE(room != len);
+
+ elem = bpf_stream_page_push_elem(page, room);
+ bpf_stream_elem_init(elem, room);
+ return elem;
+}
+
+static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
+{
+ const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
+ struct bpf_stream_elem *elem;
+ unsigned long flags;
+
+ BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ);
+ /*
+ * Length denotes the amount of data to be written as part of stream element,
+	 * thus includes the '\0' byte. We're capped by how much bpf_bprintf_buffers can
+	 * accommodate, therefore deny allocations that won't fit into them.
+ */
+ if (len < 0 || len > max_len)
+ return NULL;
+
+ if (!bpf_stream_page_local_lock(&flags))
+ return NULL;
+ elem = bpf_stream_page_reserve_elem(len);
+ bpf_stream_page_local_unlock(&flags);
+ return elem;
+}
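To make the bump-allocation arithmetic concrete: bpf_stream_page_push_elem() advances consumed by the element header plus the string payload, rounded up to 8 bytes, which keeps every element 8-byte aligned and keeps the remaining room a multiple of 8 (the WARN_ON_ONCE() in bpf_stream_page_check_room() relies on that). A worked example, assuming the header size offsetof(struct bpf_stream_elem, str) is 16 bytes; the exact value depends on the struct layout:

/* Illustrative arithmetic only:
 *
 *	len = 20			("hello from bpf prog" + '\0')
 *	record = round_up(16 + 20, 8)	= 40 bytes
 *
 * so consumed advances by 40 and the next element starts 8-byte aligned.
 */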
+
+static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len)
+{
+ struct bpf_stream_elem *elem = NULL;
+
+ /*
+	 * Allocate a bpf_stream_elem and push it to the bpf_prog_stream
+	 * log; elements will be popped all at once and reversed to print the log.
+ */
+ elem = bpf_stream_elem_alloc(len);
+ if (!elem)
+ return -ENOMEM;
+
+ memcpy(elem->str, str, len);
+ llist_add(&elem->node, log);
+
+ return 0;
+}
+
+static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len)
+{
+ if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY)
+ return -ENOSPC;
+ if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) {
+ atomic_sub(len, &stream->capacity);
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem)
+{
+ int len = elem->total_len;
+
+ atomic_sub(len, &stream->capacity);
+}
+
+static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len)
+{
+ int ret = bpf_stream_consume_capacity(stream, len);
+
+ return ret ?: __bpf_stream_push_str(&stream->log, str, len);
+}
+
+static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux)
+{
+ if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR)
+ return NULL;
+ return &aux->stream[stream_id - 1];
+}
+
+static void bpf_stream_free_elem(struct bpf_stream_elem *elem)
+{
+ struct bpf_stream_page *p;
+
+ p = bpf_stream_page_from_elem(elem);
+ bpf_stream_page_put(p);
+}
+
+static void bpf_stream_free_list(struct llist_node *list)
+{
+ struct bpf_stream_elem *elem, *tmp;
+
+ llist_for_each_entry_safe(elem, tmp, list, node)
+ bpf_stream_free_elem(elem);
+}
+
+static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream)
+{
+ return stream->backlog_head;
+}
+
+static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream)
+{
+ struct llist_node *node;
+
+ node = stream->backlog_head;
+ if (stream->backlog_head == stream->backlog_tail)
+ stream->backlog_head = stream->backlog_tail = NULL;
+ else
+ stream->backlog_head = node->next;
+ return node;
+}
+
+static void bpf_stream_backlog_fill(struct bpf_stream *stream)
+{
+ struct llist_node *head, *tail;
+
+ if (llist_empty(&stream->log))
+ return;
+ tail = llist_del_all(&stream->log);
+ if (!tail)
+ return;
+ head = llist_reverse_order(tail);
+
+ if (!stream->backlog_head) {
+ stream->backlog_head = head;
+ stream->backlog_tail = tail;
+ } else {
+ stream->backlog_tail->next = head;
+ stream->backlog_tail = tail;
+ }
+
+ return;
+}
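The log/backlog split above is the standard lock-free llist recipe: producers llist_add() concurrently (which is NMI-safe), while the single reader drains everything with llist_del_all() and reverses the LIFO batch back into FIFO order before handling it privately under the stream mutex. A generic sketch of that recipe, independent of the stream code; my_elem and consume() are hypothetical:

/* Drain-and-reverse pattern, as in bpf_stream_backlog_fill() above. */
struct my_elem {
	struct llist_node node;
	char payload[];
};

static void drain_sketch(struct llist_head *log)
{
	struct llist_node *batch = llist_del_all(log);
	struct my_elem *e, *tmp;

	if (!batch)
		return;
	batch = llist_reverse_order(batch);	/* oldest entry first */
	llist_for_each_entry_safe(e, tmp, batch, node)
		consume(e);			/* hypothetical consumer */
}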
+
+static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len)
+{
+ int rem = elem->total_len - elem->consumed_len;
+ int used = min(rem, *len);
+
+ elem->consumed_len += used;
+ *len -= used;
+
+ return elem->consumed_len == elem->total_len;
+}
+
+static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len)
+{
+ int rem_len = len, cons_len, ret = 0;
+ struct bpf_stream_elem *elem = NULL;
+ struct llist_node *node;
+
+ mutex_lock(&stream->lock);
+
+ while (rem_len) {
+ int pos = len - rem_len;
+ bool cont;
+
+ node = bpf_stream_backlog_peek(stream);
+ if (!node) {
+ bpf_stream_backlog_fill(stream);
+ node = bpf_stream_backlog_peek(stream);
+ }
+ if (!node)
+ break;
+ elem = container_of(node, typeof(*elem), node);
+
+ cons_len = elem->consumed_len;
+ cont = bpf_stream_consume_elem(elem, &rem_len) == false;
+
+ ret = copy_to_user(buf + pos, elem->str + cons_len,
+ elem->consumed_len - cons_len);
+ /* Restore in case of error. */
+ if (ret) {
+ ret = -EFAULT;
+ elem->consumed_len = cons_len;
+ break;
+ }
+
+ if (cont)
+ continue;
+ bpf_stream_backlog_pop(stream);
+ bpf_stream_release_capacity(stream, elem);
+ bpf_stream_free_elem(elem);
+ }
+
+ mutex_unlock(&stream->lock);
+ return ret ? ret : len - rem_len;
+}
+
+int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len)
+{
+ struct bpf_stream *stream;
+
+ stream = bpf_stream_get(stream_id, prog->aux);
+ if (!stream)
+ return -ENOENT;
+ return bpf_stream_read(stream, buf, len);
+}
+
+__bpf_kfunc_start_defs();
+
+/*
+ * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
+ * enum in headers.
+ */
+__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog)
+{
+ struct bpf_bprintf_data data = {
+ .get_bin_args = true,
+ .get_buf = true,
+ };
+ struct bpf_prog_aux *aux = aux__prog;
+ u32 fmt_size = strlen(fmt__str) + 1;
+ struct bpf_stream *stream;
+ u32 data_len = len__sz;
+ int ret, num_args;
+
+ stream = bpf_stream_get(stream_id, aux);
+ if (!stream)
+ return -ENOENT;
+
+ if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+ (data_len && !args))
+ return -EINVAL;
+ num_args = data_len / 8;
+
+ ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data);
+ if (ret < 0)
+ return ret;
+
+ ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args);
+ /* Exclude NULL byte during push. */
+ ret = bpf_stream_push_str(stream, data.buf, ret);
+ bpf_bprintf_cleanup(&data);
+
+ return ret;
+}
+
+__bpf_kfunc_end_defs();
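On the BPF program side, the kfunc expects format arguments pre-packed as an array of u64 values with len__sz giving the byte length (a multiple of 8, at most MAX_BPRINTF_VARARGS * 8), and aux__prog is supplied by the verifier rather than by the program. A hypothetical BPF-C wrapper under those assumptions (the extern declaration, the NULL passed for aux__prog and the wrapper name are all assumptions, not part of this patch):

/* Hypothetical single-argument convenience wrapper for BPF programs. */
extern int bpf_stream_vprintk(int stream_id, const char *fmt__str,
			      const void *args, __u32 len__sz,
			      void *aux__prog) __ksym;

static __always_inline int stream_printk_u64(int stream_id, const char *fmt, __u64 arg)
{
	__u64 args[1] = { arg };

	/* sizeof(args) == 8, satisfying the multiple-of-8 check in the kfunc. */
	return bpf_stream_vprintk(stream_id, fmt, args, sizeof(args), NULL);
}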
+
+/* The kfunc above is registered in common_btf_ids (see kernel/bpf/helpers.c). */
+
+void bpf_prog_stream_init(struct bpf_prog *prog)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+ atomic_set(&prog->aux->stream[i].capacity, 0);
+ init_llist_head(&prog->aux->stream[i].log);
+ mutex_init(&prog->aux->stream[i].lock);
+ prog->aux->stream[i].backlog_head = NULL;
+ prog->aux->stream[i].backlog_tail = NULL;
+ }
+}
+
+void bpf_prog_stream_free(struct bpf_prog *prog)
+{
+ struct llist_node *list;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+ list = llist_del_all(&prog->aux->stream[i].log);
+ bpf_stream_free_list(list);
+ bpf_stream_free_list(prog->aux->stream[i].backlog_head);
+ }
+}
+
+void bpf_stream_stage_init(struct bpf_stream_stage *ss)
+{
+ init_llist_head(&ss->log);
+ ss->len = 0;
+}
+
+void bpf_stream_stage_free(struct bpf_stream_stage *ss)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&ss->log);
+ bpf_stream_free_list(node);
+}
+
+int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...)
+{
+ struct bpf_bprintf_buffers *buf;
+ va_list args;
+ int ret;
+
+ if (bpf_try_get_buffers(&buf))
+ return -EBUSY;
+
+ va_start(args, fmt);
+ ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args);
+ va_end(args);
+ ss->len += ret;
+ /* Exclude NULL byte during push. */
+ ret = __bpf_stream_push_str(&ss->log, buf->buf, ret);
+ bpf_put_buffers();
+ return ret;
+}
+
+int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
+ enum bpf_stream_id stream_id)
+{
+ struct llist_node *list, *head, *tail;
+ struct bpf_stream *stream;
+ int ret;
+
+ stream = bpf_stream_get(stream_id, prog->aux);
+ if (!stream)
+ return -EINVAL;
+
+ ret = bpf_stream_consume_capacity(stream, ss->len);
+ if (ret)
+ return ret;
+
+ list = llist_del_all(&ss->log);
+ head = tail = list;
+
+ if (!list)
+ return 0;
+ while (llist_next(list)) {
+ tail = llist_next(list);
+ list = tail;
+ }
+ llist_add_batch(head, tail, &stream->log);
+ return 0;
+}
+
+struct dump_stack_ctx {
+ struct bpf_stream_stage *ss;
+ int err;
+};
+
+static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+ struct dump_stack_ctx *ctxp = cookie;
+ const char *file = "", *line = "";
+ struct bpf_prog *prog;
+ int num, ret;
+
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(ip);
+ rcu_read_unlock();
+ if (prog) {
+ ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num);
+ if (ret < 0)
+ goto end;
+ ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n",
+ (void *)(long)ip, line, file, num);
+ return !ctxp->err;
+ }
+end:
+ ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip);
+ return !ctxp->err;
+}
+
+int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss)
+{
+ struct dump_stack_ctx ctx = { .ss = ss };
+ int ret;
+
+ ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n",
+ raw_smp_processor_id(), __kuid_val(current_real_cred()->euid),
+ current->pid, current->comm);
+ if (ret)
+ return ret;
+ ret = bpf_stream_stage_printk(ss, "Call trace:\n");
+ if (ret)
+ return ret;
+ arch_bpf_stack_walk(dump_stack_cb, &ctx);
+ if (ctx.err)
+ return ctx.err;
+ return bpf_stream_stage_printk(ss, "\n");
+}
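The staging helpers above are meant to be driven as a unit: a caller builds a report in a private stage and only splices it into the program's stream on commit. A minimal sketch of that flow, assuming a caller that already holds a bpf_prog pointer and wants the report on the error stream (the BPF_STDERR id is taken from the UAPI enum, not from this hunk):

	struct bpf_stream_stage ss;
	int ret;

	bpf_stream_stage_init(&ss);
	ret = bpf_stream_stage_printk(&ss, "prog %s: something went wrong\n",
				      prog->aux->name);
	if (!ret)
		ret = bpf_stream_stage_dump_stack(&ss);
	if (!ret)
		ret = bpf_stream_stage_commit(&ss, prog, BPF_STDERR);
	bpf_stream_stage_free(&ss);

Freeing the stage after a successful commit is harmless, since commit empties the staged list before splicing it into the stream.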
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 64c3393e8270..e63039817af3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -36,6 +36,7 @@
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
#include <linux/tracepoint.h>
+#include <linux/overflow.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -578,7 +579,7 @@ static bool can_alloc_pages(void)
static struct page *__bpf_alloc_page(int nid)
{
if (!can_alloc_pages())
- return try_alloc_pages(nid, 0);
+ return alloc_pages_nolock(nid, 0);
return alloc_pages_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
@@ -693,7 +694,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
if (IS_ERR_OR_NULL(rec))
return NULL;
- size = offsetof(struct btf_record, fields[rec->cnt]);
+ size = struct_size(rec, fields, rec->cnt);
new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
if (!new_rec)
return ERR_PTR(-ENOMEM);
@@ -748,7 +749,7 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r
return false;
if (rec_a->cnt != rec_b->cnt)
return false;
- size = offsetof(struct btf_record, fields[rec_a->cnt]);
+ size = struct_size(rec_a, fields, rec_a->cnt);
/* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
* members are zeroed out. So memcmp is safe to do without worrying
* about padding/unused fields.
@@ -3068,7 +3069,7 @@ static int bpf_obj_get(const union bpf_attr *attr)
*/
void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
const struct bpf_link_ops *ops, struct bpf_prog *prog,
- bool sleepable)
+ enum bpf_attach_type attach_type, bool sleepable)
{
WARN_ON(ops->dealloc && ops->dealloc_deferred);
atomic64_set(&link->refcnt, 1);
@@ -3077,12 +3078,14 @@ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
link->id = 0;
link->ops = ops;
link->prog = prog;
+ link->attach_type = attach_type;
}
void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
- const struct bpf_link_ops *ops, struct bpf_prog *prog)
+ const struct bpf_link_ops *ops, struct bpf_prog *prog,
+ enum bpf_attach_type attach_type)
{
- bpf_link_init_sleepable(link, type, ops, prog, false);
+ bpf_link_init_sleepable(link, type, ops, prog, attach_type, false);
}
static void bpf_link_free_id(int id)
@@ -3227,7 +3230,14 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) {
- seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
+ if (link->type == BPF_LINK_TYPE_KPROBE_MULTI)
+ seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ?
+ "kretprobe_multi" : "kprobe_multi");
+ else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI)
+ seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ?
+ "uretprobe_multi" : "uprobe_multi");
+ else
+ seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
} else {
WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type);
seq_printf(m, "link_type:\t<%u>\n", type);
@@ -3402,10 +3412,12 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
seq_printf(seq,
"attach_type:\t%d\n"
"target_obj_id:\t%u\n"
- "target_btf_id:\t%u\n",
- tr_link->attach_type,
+ "target_btf_id:\t%u\n"
+ "cookie:\t%llu\n",
+ link->attach_type,
target_obj_id,
- target_btf_id);
+ target_btf_id,
+ tr_link->link.cookie);
}
static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
@@ -3414,7 +3426,8 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
struct bpf_tracing_link *tr_link =
container_of(link, struct bpf_tracing_link, link.link);
- info->tracing.attach_type = tr_link->attach_type;
+ info->tracing.attach_type = link->attach_type;
+ info->tracing.cookie = tr_link->link.cookie;
bpf_trampoline_unpack_key(tr_link->trampoline->key,
&info->tracing.target_obj_id,
&info->tracing.target_btf_id);
@@ -3432,7 +3445,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
static int bpf_tracing_prog_attach(struct bpf_prog *prog,
int tgt_prog_fd,
u32 btf_id,
- u64 bpf_cookie)
+ u64 bpf_cookie,
+ enum bpf_attach_type attach_type)
{
struct bpf_link_primer link_primer;
struct bpf_prog *tgt_prog = NULL;
@@ -3500,8 +3514,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
goto out_put_prog;
}
bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
- &bpf_tracing_link_lops, prog);
- link->attach_type = prog->expected_attach_type;
+ &bpf_tracing_link_lops, prog, attach_type);
+
link->link.cookie = bpf_cookie;
mutex_lock(&prog->aux->dst_mutex);
@@ -3650,8 +3664,10 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
container_of(link, struct bpf_raw_tp_link, link);
seq_printf(seq,
- "tp_name:\t%s\n",
- raw_tp_link->btp->tp->name);
+ "tp_name:\t%s\n"
+ "cookie:\t%llu\n",
+ raw_tp_link->btp->tp->name,
+ raw_tp_link->cookie);
}
static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen,
@@ -3687,6 +3703,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
return -EINVAL;
info->raw_tracepoint.tp_name_len = tp_len + 1;
+ info->raw_tracepoint.cookie = raw_tp_link->cookie;
if (!ubuf)
return 0;
@@ -3793,20 +3810,46 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
info->perf_event.kprobe.cookie = event->bpf_cookie;
return 0;
}
+
+static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ const char *name;
+ int err;
+ u32 prog_id, type;
+ u64 offset, addr;
+ unsigned long missed;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &type, &name,
+ &offset, &addr, &missed);
+ if (err)
+ return;
+
+ seq_printf(seq,
+ "name:\t%s\n"
+ "offset:\t%#llx\n"
+ "missed:\t%lu\n"
+ "addr:\t%#llx\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ name, offset, missed, addr,
+ type == BPF_FD_TYPE_KRETPROBE ? "kretprobe" : "kprobe",
+ event->bpf_cookie);
+}
#endif
#ifdef CONFIG_UPROBE_EVENTS
static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
struct bpf_link_info *info)
{
+ u64 ref_ctr_offset, offset;
char __user *uname;
- u64 addr, offset;
u32 ulen, type;
int err;
uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
ulen = info->perf_event.uprobe.name_len;
- err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr,
+ err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &ref_ctr_offset,
&type, NULL);
if (err)
return err;
@@ -3818,8 +3861,34 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
info->perf_event.uprobe.name_len = ulen;
info->perf_event.uprobe.offset = offset;
info->perf_event.uprobe.cookie = event->bpf_cookie;
+ info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset;
return 0;
}
+
+static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ const char *name;
+ int err;
+ u32 prog_id, type;
+ u64 offset, ref_ctr_offset;
+ unsigned long missed;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &type, &name,
+ &offset, &ref_ctr_offset, &missed);
+ if (err)
+ return;
+
+ seq_printf(seq,
+ "name:\t%s\n"
+ "offset:\t%#llx\n"
+ "ref_ctr_offset:\t%#llx\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ name, offset, ref_ctr_offset,
+ type == BPF_FD_TYPE_URETPROBE ? "uretprobe" : "uprobe",
+ event->bpf_cookie);
+}
#endif
static int bpf_perf_link_fill_probe(const struct perf_event *event,
@@ -3888,10 +3957,79 @@ static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
}
}
+static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ seq_printf(seq,
+ "type:\t%u\n"
+ "config:\t%llu\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ event->attr.type, event->attr.config,
+ "event", event->bpf_cookie);
+}
+
+static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event,
+ struct seq_file *seq)
+{
+ int err;
+ const char *name;
+ u32 prog_id;
+
+ err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL,
+ NULL, NULL);
+ if (err)
+ return;
+
+ seq_printf(seq,
+ "tp_name:\t%s\n"
+ "event_type:\t%s\n"
+ "cookie:\t%llu\n",
+ name, "tracepoint", event->bpf_cookie);
+}
+
+static void bpf_probe_link_show_fdinfo(const struct perf_event *event,
+ struct seq_file *seq)
+{
+#ifdef CONFIG_KPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
+ return bpf_perf_link_fdinfo_kprobe(event, seq);
+#endif
+
+#ifdef CONFIG_UPROBE_EVENTS
+ if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
+ return bpf_perf_link_fdinfo_uprobe(event, seq);
+#endif
+}
+
+static void bpf_perf_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_perf_link *perf_link;
+ const struct perf_event *event;
+
+ perf_link = container_of(link, struct bpf_perf_link, link);
+ event = perf_get_event(perf_link->perf_file);
+ if (IS_ERR(event))
+ return;
+
+ switch (event->prog->type) {
+ case BPF_PROG_TYPE_PERF_EVENT:
+ return bpf_perf_event_link_show_fdinfo(event, seq);
+ case BPF_PROG_TYPE_TRACEPOINT:
+ return bpf_tracepoint_link_show_fdinfo(event, seq);
+ case BPF_PROG_TYPE_KPROBE:
+ return bpf_probe_link_show_fdinfo(event, seq);
+ default:
+ return;
+ }
+}
+
static const struct bpf_link_ops bpf_perf_link_lops = {
.release = bpf_perf_link_release,
.dealloc = bpf_perf_link_dealloc,
.fill_link_info = bpf_perf_link_fill_link_info,
+ .show_fdinfo = bpf_perf_link_show_fdinfo,
};
static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -3914,7 +4052,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
err = -ENOMEM;
goto out_put_file;
}
- bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog,
+ attr->link_create.attach_type);
link->perf_file = perf_file;
err = bpf_link_prime(&link->link, &link_primer);
@@ -3946,7 +4085,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
#endif /* CONFIG_PERF_EVENTS */
static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
- const char __user *user_tp_name, u64 cookie)
+ const char __user *user_tp_name, u64 cookie,
+ enum bpf_attach_type attach_type)
{
struct bpf_link_primer link_primer;
struct bpf_raw_tp_link *link;
@@ -3969,7 +4109,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
tp_name = prog->aux->attach_func_name;
break;
}
- return bpf_tracing_prog_attach(prog, 0, 0, 0);
+ return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type);
case BPF_PROG_TYPE_RAW_TRACEPOINT:
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
@@ -3991,7 +4131,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
goto out_put_btp;
}
bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
- &bpf_raw_tp_link_lops, prog,
+ &bpf_raw_tp_link_lops, prog, attach_type,
tracepoint_is_faultable(btp->tp));
link->btp = btp;
link->cookie = cookie;
@@ -4033,7 +4173,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
cookie = attr->raw_tracepoint.cookie;
- fd = bpf_raw_tp_link_attach(prog, tp_name, cookie);
+ fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type);
if (fd < 0)
bpf_prog_put(prog);
return fd;
@@ -4183,6 +4323,25 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
}
}
+static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype,
+ bool check_atype)
+{
+ switch (ptype) {
+ case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ return true;
+ case BPF_PROG_TYPE_LSM:
+ return check_atype ? atype == BPF_LSM_CGROUP : true;
+ default:
+ return false;
+ }
+}
+
#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
#define BPF_F_ATTACH_MASK_BASE \
@@ -4213,6 +4372,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (bpf_mprog_supported(ptype)) {
if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
return -EINVAL;
+ } else if (is_cgroup_prog_type(ptype, 0, false)) {
+ if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG))
+ return -EINVAL;
} else {
if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
return -EINVAL;
@@ -4230,6 +4392,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return -EINVAL;
}
+ if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) {
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ goto out;
+ }
+
switch (ptype) {
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
@@ -4241,20 +4408,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_FLOW_DISSECTOR:
ret = netns_bpf_prog_attach(attr, prog);
break;
- case BPF_PROG_TYPE_CGROUP_DEVICE:
- case BPF_PROG_TYPE_CGROUP_SKB:
- case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
- case BPF_PROG_TYPE_CGROUP_SOCKOPT:
- case BPF_PROG_TYPE_CGROUP_SYSCTL:
- case BPF_PROG_TYPE_SOCK_OPS:
- case BPF_PROG_TYPE_LSM:
- if (ptype == BPF_PROG_TYPE_LSM &&
- prog->expected_attach_type != BPF_LSM_CGROUP)
- ret = -EINVAL;
- else
- ret = cgroup_bpf_prog_attach(attr, ptype, prog);
- break;
case BPF_PROG_TYPE_SCHED_CLS:
if (attr->attach_type == BPF_TCX_INGRESS ||
attr->attach_type == BPF_TCX_EGRESS)
@@ -4265,7 +4418,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
default:
ret = -EINVAL;
}
-
+out:
if (ret)
bpf_prog_put(prog);
return ret;
@@ -4293,6 +4446,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
if (IS_ERR(prog))
return PTR_ERR(prog);
}
+ } else if (is_cgroup_prog_type(ptype, 0, false)) {
+ if (attr->attach_flags || attr->relative_fd)
+ return -EINVAL;
} else if (attr->attach_flags ||
attr->relative_fd ||
attr->expected_revision) {
@@ -5083,6 +5239,21 @@ static int bpf_link_get_info_by_fd(struct file *file,
}
+static int token_get_info_by_fd(struct file *file,
+ struct bpf_token *token,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
+ if (err)
+ return err;
+ return bpf_token_get_info_by_fd(token, attr, uattr);
+}
+
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -5106,6 +5277,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
attr, uattr);
+ else if (fd_file(f)->f_op == &bpf_token_fops)
+ return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
+ attr, uattr);
return -EINVAL;
}
@@ -5193,21 +5367,10 @@ static int bpf_task_fd_query_copy(const union bpf_attr *attr,
if (put_user(zero, ubuf))
return -EFAULT;
- } else if (input_len >= len + 1) {
- /* ubuf can hold the string with NULL terminator */
- if (copy_to_user(ubuf, buf, len + 1))
- return -EFAULT;
} else {
- /* ubuf cannot hold the string with NULL terminator,
- * do a partial copy with NULL terminator.
- */
- char zero = '\0';
-
- err = -ENOSPC;
- if (copy_to_user(ubuf, buf, input_len - 1))
- return -EFAULT;
- if (put_user(zero, ubuf + input_len - 1))
- return -EFAULT;
+ err = bpf_copy_to_user(ubuf, buf, input_len, len);
+ if (err == -EFAULT)
+ return err;
}
}
@@ -5385,7 +5548,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
attr->link_create.target_btf_id,
- attr->link_create.tracing.cookie);
+ attr->link_create.tracing.cookie,
+ attr->link_create.attach_type);
break;
case BPF_PROG_TYPE_LSM:
case BPF_PROG_TYPE_TRACING:
@@ -5394,7 +5558,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
goto out;
}
if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
- ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie);
+ ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie,
+ attr->link_create.attach_type);
else if (prog->expected_attach_type == BPF_TRACE_ITER)
ret = bpf_iter_link_attach(attr, uattr, prog);
else if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -5403,7 +5568,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
attr->link_create.target_btf_id,
- attr->link_create.tracing.cookie);
+ attr->link_create.tracing.cookie,
+ attr->link_create.attach_type);
break;
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_SK_LOOKUP:
@@ -5792,6 +5958,28 @@ static int token_create(union bpf_attr *attr)
return bpf_token_create(attr);
}
+#define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd
+
+static int prog_stream_read(union bpf_attr *attr)
+{
+ char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf);
+ u32 len = attr->prog_stream_read.stream_buf_len;
+ struct bpf_prog *prog;
+ int ret;
+
+ if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD))
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->prog_stream_read.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len);
+ bpf_prog_put(prog);
+
+ return ret;
+}
+
static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
@@ -5928,6 +6116,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
case BPF_TOKEN_CREATE:
err = token_create(&attr);
break;
+ case BPF_PROG_STREAM_READ_BY_FD:
+ err = prog_stream_read(&attr);
+ break;
default:
err = -EINVAL;
break;
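On the user-space side the new command is a plain bpf(2) call; the attribute fields mirror the prog_stream_read() handler above. A rough sketch, assuming UAPI headers new enough to carry the prog_stream_read fields and the BPF_STDERR stream id (error handling trimmed):

	#include <stdio.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static void drain_stderr_stream(int prog_fd)
	{
		union bpf_attr attr = {};
		char buf[4096];
		long n;

		attr.prog_stream_read.prog_fd = prog_fd;
		attr.prog_stream_read.stream_id = BPF_STDERR;
		attr.prog_stream_read.stream_buf = (uint64_t)(uintptr_t)buf;
		attr.prog_stream_read.stream_buf_len = sizeof(buf);

		/* A positive return is the number of bytes copied;
		 * 0 means the stream is drained for now. */
		while ((n = syscall(__NR_bpf, BPF_PROG_STREAM_READ_BY_FD,
				    &attr, sizeof(attr))) > 0)
			fwrite(buf, 1, n, stderr);
	}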
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 81d6cf90584a..9cbe15ce3540 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -7,14 +7,46 @@
#include <linux/kobject.h>
#include <linux/init.h>
#include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */
extern char __start_BTF[];
extern char __stop_BTF[];
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
+ const struct bin_attribute *attr,
+ struct vm_area_struct *vma)
+{
+ unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
+ size_t vm_size = vma->vm_end - vma->vm_start;
+ phys_addr_t addr = __pa_symbol(__start_BTF);
+ unsigned long pfn = addr >> PAGE_SHIFT;
+
+ if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ if (vma->vm_pgoff)
+ return -EINVAL;
+
+ if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
+ return -EACCES;
+
+ if (pfn + pages < pfn)
+ return -EINVAL;
+
+ if ((vm_size >> PAGE_SHIFT) > pages)
+ return -EINVAL;
+
+ vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
+ return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
+}
+
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.attr = { .name = "vmlinux", .mode = 0444, },
- .read_new = sysfs_bin_attr_simple_read,
+ .read = sysfs_bin_attr_simple_read,
+ .mmap = btf_sysfs_vmlinux_mmap,
};
struct kobject *btf_kobj;
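The mmap handler only accepts a private, read-only mapping of the whole BTF blob at offset zero, so a user-space consumer would map it roughly like this (a sketch, no libbpf involved):

	#include <fcntl.h>
	#include <stddef.h>
	#include <sys/mman.h>
	#include <sys/stat.h>
	#include <unistd.h>

	static void *map_vmlinux_btf(size_t *size)
	{
		struct stat st;
		void *btf;
		int fd;

		fd = open("/sys/kernel/btf/vmlinux", O_RDONLY);
		if (fd < 0)
			return NULL;
		if (fstat(fd, &st)) {
			close(fd);
			return NULL;
		}
		/* PROT_READ + MAP_PRIVATE is the only combination the handler
		 * above permits; writable, executable or shared mappings fail. */
		btf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
		close(fd);
		if (btf == MAP_FAILED)
			return NULL;
		*size = st.st_size;
		return btf;
	}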
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
index 2e4885e7781f..efd987ea6872 100644
--- a/kernel/bpf/tcx.c
+++ b/kernel/bpf/tcx.c
@@ -142,7 +142,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd,
u64 revision)
{
struct tcx_link *tcx = tcx_link(link);
- bool created, ingress = tcx->location == BPF_TCX_INGRESS;
+ bool created, ingress = link->attach_type == BPF_TCX_INGRESS;
struct bpf_mprog_entry *entry, *entry_new;
struct net_device *dev = tcx->dev;
int ret;
@@ -169,7 +169,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd,
static void tcx_link_release(struct bpf_link *link)
{
struct tcx_link *tcx = tcx_link(link);
- bool ingress = tcx->location == BPF_TCX_INGRESS;
+ bool ingress = link->attach_type == BPF_TCX_INGRESS;
struct bpf_mprog_entry *entry, *entry_new;
struct net_device *dev;
int ret = 0;
@@ -204,7 +204,7 @@ static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog,
struct bpf_prog *oprog)
{
struct tcx_link *tcx = tcx_link(link);
- bool ingress = tcx->location == BPF_TCX_INGRESS;
+ bool ingress = link->attach_type == BPF_TCX_INGRESS;
struct bpf_mprog_entry *entry, *entry_new;
struct net_device *dev;
int ret = 0;
@@ -260,8 +260,8 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
seq_printf(seq, "ifindex:\t%u\n", ifindex);
seq_printf(seq, "attach_type:\t%u (%s)\n",
- tcx->location,
- tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress");
+ link->attach_type,
+ link->attach_type == BPF_TCX_INGRESS ? "ingress" : "egress");
}
static int tcx_link_fill_info(const struct bpf_link *link,
@@ -276,7 +276,7 @@ static int tcx_link_fill_info(const struct bpf_link *link,
rtnl_unlock();
info->tcx.ifindex = ifindex;
- info->tcx.attach_type = tcx->location;
+ info->tcx.attach_type = link->attach_type;
return 0;
}
@@ -301,8 +301,8 @@ static int tcx_link_init(struct tcx_link *tcx,
struct net_device *dev,
struct bpf_prog *prog)
{
- bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog);
- tcx->location = attr->link_create.attach_type;
+ bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog,
+ attr->link_create.attach_type);
tcx->dev = dev;
return bpf_link_prime(&tcx->link, link_primer);
}
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 9dbc31b25e3d..fa353c5d550f 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -83,6 +83,11 @@ struct tnum tnum_sub(struct tnum a, struct tnum b)
return TNUM(dv & ~mu, mu);
}
+struct tnum tnum_neg(struct tnum a)
+{
+ return tnum_sub(TNUM(0, 0), a);
+}
+
struct tnum tnum_and(struct tnum a, struct tnum b)
{
u64 alpha, beta, v;
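tnum_neg() is just subtraction from the constant-zero tnum, so a fully known operand stays fully known. A quick worked check of the definition above, following tnum_sub()'s value/mask bookkeeping (illustrative arithmetic only):

	tnum_neg(TNUM(5, 0)):
		dv = 0 - 5 = 0xfffffffffffffffb
		both masks are 0, so mu = 0
		result: TNUM(0xfffffffffffffffb, 0), i.e. exactly -5

Partially unknown operands generally widen instead, since the borrow from an unknown bit can reach every higher bit.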
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 26057aa13503..0bbe412f854e 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -103,7 +103,7 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
static const struct inode_operations bpf_token_iops = { };
-static const struct file_operations bpf_token_fops = {
+const struct file_operations bpf_token_fops = {
.release = bpf_token_release,
.show_fdinfo = bpf_token_show_fdinfo,
};
@@ -210,6 +210,29 @@ out_file:
return err;
}
+int bpf_token_get_info_by_fd(struct bpf_token *token,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_token_info info;
+ u32 info_len = attr->info.info_len;
+
+ info_len = min_t(u32, info_len, sizeof(info));
+ memset(&info, 0, sizeof(info));
+
+ info.allowed_cmds = token->allowed_cmds;
+ info.allowed_maps = token->allowed_maps;
+ info.allowed_progs = token->allowed_progs;
+ info.allowed_attachs = token->allowed_attachs;
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
struct bpf_token *bpf_token_get_from_fd(u32 ufd)
{
CLASS(fd, f)(ufd);
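With bpf_token_fops now exported, BPF_OBJ_GET_INFO_BY_FD can target a token fd directly. A user-space sketch of querying the allow-masks filled in above, assuming struct bpf_token_info from the matching UAPI update (not part of this hunk):

	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static int token_query(int token_fd, struct bpf_token_info *info)
	{
		union bpf_attr attr = {};

		memset(info, 0, sizeof(*info));
		attr.info.bpf_fd = token_fd;
		attr.info.info = (uint64_t)(uintptr_t)info;
		attr.info.info_len = sizeof(*info);

		/* On success the kernel writes back the (possibly clamped)
		 * info_len and the allowed_{cmds,maps,progs,attachs} masks. */
		return syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD,
			       &attr, sizeof(attr));
	}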
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index c4b1a98ff726..0e364614c3a2 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -674,7 +674,8 @@ static const struct bpf_link_ops bpf_shim_tramp_link_lops = {
static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog,
bpf_func_t bpf_func,
- int cgroup_atype)
+ int cgroup_atype,
+ enum bpf_attach_type attach_type)
{
struct bpf_shim_tramp_link *shim_link = NULL;
struct bpf_prog *p;
@@ -701,7 +702,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog
p->expected_attach_type = BPF_LSM_MAC;
bpf_prog_inc(p);
bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
- &bpf_shim_tramp_link_lops, p);
+ &bpf_shim_tramp_link_lops, p, attach_type);
bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
return shim_link;
@@ -726,7 +727,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
}
int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
- int cgroup_atype)
+ int cgroup_atype,
+ enum bpf_attach_type attach_type)
{
struct bpf_shim_tramp_link *shim_link = NULL;
struct bpf_attach_target_info tgt_info = {};
@@ -763,7 +765,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
/* Allocate and install new shim. */
- shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype);
+ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype, attach_type);
if (!shim_link) {
err = -ENOMEM;
goto err;
@@ -911,27 +913,32 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
return bpf_prog_start_time();
}
-static void notrace update_prog_stats(struct bpf_prog *prog,
- u64 start)
+static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
{
struct bpf_prog_stats *stats;
+ unsigned long flags;
+ u64 duration;
- if (static_branch_unlikely(&bpf_stats_enabled_key) &&
- /* static_key could be enabled in __bpf_prog_enter*
- * and disabled in __bpf_prog_exit*.
- * And vice versa.
- * Hence check that 'start' is valid.
- */
- start > NO_START_TIME) {
- u64 duration = sched_clock() - start;
- unsigned long flags;
-
- stats = this_cpu_ptr(prog->stats);
- flags = u64_stats_update_begin_irqsave(&stats->syncp);
- u64_stats_inc(&stats->cnt);
- u64_stats_add(&stats->nsecs, duration);
- u64_stats_update_end_irqrestore(&stats->syncp, flags);
- }
+ /*
+ * static_key could be enabled in __bpf_prog_enter* and disabled in
+ * __bpf_prog_exit*. And vice versa. Check that 'start' is valid.
+ */
+ if (start <= NO_START_TIME)
+ return;
+
+ duration = sched_clock() - start;
+ stats = this_cpu_ptr(prog->stats);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->cnt);
+ u64_stats_add(&stats->nsecs, duration);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
+static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
+ u64 start)
+{
+ if (static_branch_unlikely(&bpf_stats_enabled_key))
+ __update_prog_stats(prog, start);
}
static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 54c6953a8b84..399f03e62508 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -44,6 +44,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
#undef BPF_LINK_TYPE
};
+enum bpf_features {
+ BPF_FEAT_RDONLY_CAST_TO_VOID = 0,
+ BPF_FEAT_STREAMS = 1,
+ __MAX_BPF_FEAT,
+};
+
struct bpf_mem_alloc bpf_global_percpu_ma;
static bool bpf_global_percpu_ma_set;
@@ -322,6 +328,7 @@ struct bpf_kfunc_call_arg_meta {
struct btf *arg_btf;
u32 arg_btf_id;
bool arg_owning_ref;
+ bool arg_prog;
struct {
struct btf_field *field;
@@ -404,7 +411,8 @@ static bool reg_not_null(const struct bpf_reg_state *reg)
type == PTR_TO_MAP_KEY ||
type == PTR_TO_SOCK_COMMON ||
(type == PTR_TO_BTF_ID && is_trusted_reg(reg)) ||
- type == PTR_TO_MEM;
+ (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) ||
+ type == CONST_PTR_TO_MAP;
}
static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
@@ -854,7 +862,7 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
* dynptr
*/
if (state->stack[i].slot_type[0] != STACK_DYNPTR) {
- verbose(env, "verifier internal error: misconfigured ref_obj_id\n");
+ verifier_bug(env, "misconfigured ref_obj_id");
return -EFAULT;
}
if (state->stack[i].spilled_ptr.dynptr.first_slot)
@@ -1402,7 +1410,7 @@ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
goto out;
alloc_size = kmalloc_size_roundup(size_mul(new_n, size));
- new_arr = krealloc(arr, alloc_size, GFP_KERNEL);
+ new_arr = krealloc(arr, alloc_size, GFP_KERNEL_ACCOUNT);
if (!new_arr) {
kfree(arr);
return NULL;
@@ -1419,7 +1427,7 @@ out:
static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src)
{
dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
- sizeof(struct bpf_reference_state), GFP_KERNEL);
+ sizeof(struct bpf_reference_state), GFP_KERNEL_ACCOUNT);
if (!dst->refs)
return -ENOMEM;
@@ -1438,7 +1446,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st
size_t n = src->allocated_stack / BPF_REG_SIZE;
dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!dst->stack)
return -ENOMEM;
@@ -1646,7 +1654,7 @@ static void update_peak_states(struct bpf_verifier_env *env)
{
u32 cur_states;
- cur_states = env->explored_states_size + env->free_list_size;
+ cur_states = env->explored_states_size + env->free_list_size + env->num_backedges;
env->peak_states = max(env->peak_states, cur_states);
}
@@ -1658,6 +1666,13 @@ static void free_func_state(struct bpf_func_state *state)
kfree(state);
}
+static void clear_jmp_history(struct bpf_verifier_state *state)
+{
+ kfree(state->jmp_history);
+ state->jmp_history = NULL;
+ state->jmp_history_cnt = 0;
+}
+
static void free_verifier_state(struct bpf_verifier_state *state,
bool free_self)
{
@@ -1668,11 +1683,12 @@ static void free_verifier_state(struct bpf_verifier_state *state,
state->frame[i] = NULL;
}
kfree(state->refs);
+ clear_jmp_history(state);
if (free_self)
kfree(state);
}
-/* struct bpf_verifier_state->{parent,loop_entry} refer to states
+/* struct bpf_verifier_state->parent refers to states
 * that are in either of env->{explored_states,free_list}.
* In both cases the state is contained in struct bpf_verifier_state_list.
*/
@@ -1683,37 +1699,24 @@ static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_
return NULL;
}
-static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st)
-{
- if (st->loop_entry)
- return container_of(st->loop_entry, struct bpf_verifier_state_list, state);
- return NULL;
-}
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st);
/* A state can be freed if it is no longer referenced:
* - is in the env->free_list;
* - has no children states;
- * - is not used as loop_entry.
- *
- * Freeing a state can make it's loop_entry free-able.
*/
static void maybe_free_verifier_state(struct bpf_verifier_env *env,
struct bpf_verifier_state_list *sl)
{
- struct bpf_verifier_state_list *loop_entry_sl;
-
- while (sl && sl->in_free_list &&
- sl->state.branches == 0 &&
- sl->state.used_as_loop_entry == 0) {
- loop_entry_sl = state_loop_entry_as_list(&sl->state);
- if (loop_entry_sl)
- loop_entry_sl->state.used_as_loop_entry--;
- list_del(&sl->node);
- free_verifier_state(&sl->state, false);
- kfree(sl);
- env->free_list_size--;
- sl = loop_entry_sl;
- }
+	if (!sl->in_free_list ||
+	    sl->state.branches != 0 ||
+	    incomplete_read_marks(env, &sl->state))
+ return;
+ list_del(&sl->node);
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ env->free_list_size--;
}
/* copy verifier state from src to dst growing dst stack space
@@ -1732,6 +1735,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
struct bpf_func_state *dst;
int i, err;
+ dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
+ src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
+ GFP_KERNEL_ACCOUNT);
+ if (!dst_state->jmp_history)
+ return -ENOMEM;
+ dst_state->jmp_history_cnt = src->jmp_history_cnt;
+
/* if dst has more stack frames then src frame, free them, this is also
* necessary in case of exceptional exits using bpf_throw.
*/
@@ -1749,17 +1759,14 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
- dst_state->insn_hist_start = src->insn_hist_start;
- dst_state->insn_hist_end = src->insn_hist_end;
dst_state->dfs_depth = src->dfs_depth;
dst_state->callback_unroll_depth = src->callback_unroll_depth;
- dst_state->used_as_loop_entry = src->used_as_loop_entry;
dst_state->may_goto_depth = src->may_goto_depth;
- dst_state->loop_entry = src->loop_entry;
+ dst_state->equal_state = src->equal_state;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
- dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL_ACCOUNT);
if (!dst)
return -ENOMEM;
dst_state->frame[i] = dst;
@@ -1798,184 +1805,241 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
return true;
}
-/* Open coded iterators allow back-edges in the state graph in order to
- * check unbounded loops that iterators.
- *
- * In is_state_visited() it is necessary to know if explored states are
- * part of some loops in order to decide whether non-exact states
- * comparison could be used:
- * - non-exact states comparison establishes sub-state relation and uses
- * read and precision marks to do so, these marks are propagated from
- * children states and thus are not guaranteed to be final in a loop;
- * - exact states comparison just checks if current and explored states
- * are identical (and thus form a back-edge).
- *
- * Paper "A New Algorithm for Identifying Loops in Decompilation"
- * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
- * algorithm for loop structure detection and gives an overview of
- * relevant terminology. It also has helpful illustrations.
- *
- * [1] https://api.semanticscholar.org/CorpusID:15784067
- *
- * We use a similar algorithm but because loop nested structure is
- * irrelevant for verifier ours is significantly simpler and resembles
- * strongly connected components algorithm from Sedgewick's textbook.
- *
- * Define topmost loop entry as a first node of the loop traversed in a
- * depth first search starting from initial state. The goal of the loop
- * tracking algorithm is to associate topmost loop entries with states
- * derived from these entries.
- *
- * For each step in the DFS states traversal algorithm needs to identify
- * the following situations:
- *
- * initial initial initial
- * | | |
- * V V V
- * ... ... .---------> hdr
- * | | | |
- * V V | V
- * cur .-> succ | .------...
- * | | | | | |
- * V | V | V V
- * succ '-- cur | ... ...
- * | | |
- * | V V
- * | succ <- cur
- * | |
- * | V
- * | ...
- * | |
- * '----'
- *
- * (A) successor state of cur (B) successor state of cur or it's entry
- * not yet traversed are in current DFS path, thus cur and succ
- * are members of the same outermost loop
- *
- * initial initial
- * | |
- * V V
- * ... ...
- * | |
- * V V
- * .------... .------...
- * | | | |
- * V V V V
- * .-> hdr ... ... ...
- * | | | | |
- * | V V V V
- * | succ <- cur succ <- cur
- * | | |
- * | V V
- * | ... ...
- * | | |
- * '----' exit
- *
- * (C) successor state of cur is a part of some loop but this loop
- * does not include cur or successor state is not in a loop at all.
- *
- * Algorithm could be described as the following python code:
- *
- * traversed = set() # Set of traversed nodes
- * entries = {} # Mapping from node to loop entry
- * depths = {} # Depth level assigned to graph node
- * path = set() # Current DFS path
- *
- * # Find outermost loop entry known for n
- * def get_loop_entry(n):
- * h = entries.get(n, None)
- * while h in entries:
- * h = entries[h]
- * return h
- *
- * # Update n's loop entry if h comes before n in current DFS path.
- * def update_loop_entry(n, h):
- * if h in path and depths[entries.get(n, n)] < depths[n]:
- * entries[n] = h1
+/* Return IP for a given frame in a call stack */
+static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame)
+{
+ return frame == st->curframe
+ ? st->insn_idx
+ : st->frame[frame + 1]->callsite;
+}
+
+/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
+ * if such frame exists form a corresponding @callchain as an array of
+ * call sites leading to this frame and SCC id.
+ * E.g.:
*
- * def dfs(n, depth):
- * traversed.add(n)
- * path.add(n)
- * depths[n] = depth
- * for succ in G.successors(n):
- * if succ not in traversed:
- * # Case A: explore succ and update cur's loop entry
- * # only if succ's entry is in current DFS path.
- * dfs(succ, depth + 1)
- * h = entries.get(succ, None)
- * update_loop_entry(n, h)
- * else:
- * # Case B or C depending on `h1 in path` check in update_loop_entry().
- * update_loop_entry(n, succ)
- * path.remove(n)
+ * void foo() { A: loop {... SCC#1 ...}; }
+ * void bar() { B: loop { C: foo(); ... SCC#2 ... }
+ * D: loop { E: foo(); ... SCC#3 ... } }
+ * void main() { F: bar(); }
*
- * To adapt this algorithm for use with verifier:
- * - use st->branch == 0 as a signal that DFS of succ had been finished
- * and cur's loop entry has to be updated (case A), handle this in
- * update_branch_counts();
- * - use st->branch > 0 as a signal that st is in the current DFS path;
- * - handle cases B and C in is_state_visited().
+ * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
+ * on @st frame call sites being (F,C,A) or (F,E,A).
*/
-static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
- struct bpf_verifier_state *st)
+static bool compute_scc_callchain(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ struct bpf_scc_callchain *callchain)
{
- struct bpf_verifier_state *topmost = st->loop_entry;
- u32 steps = 0;
+ u32 i, scc, insn_idx;
- while (topmost && topmost->loop_entry) {
- if (steps++ > st->dfs_depth) {
- WARN_ONCE(true, "verifier bug: infinite loop in get_loop_entry\n");
- verbose(env, "verifier bug: infinite loop in get_loop_entry()\n");
- return ERR_PTR(-EFAULT);
+ memset(callchain, 0, sizeof(*callchain));
+ for (i = 0; i <= st->curframe; i++) {
+ insn_idx = frame_insn_idx(st, i);
+ scc = env->insn_aux_data[insn_idx].scc;
+ if (scc) {
+ callchain->scc = scc;
+ break;
+ } else if (i < st->curframe) {
+ callchain->callsites[i] = insn_idx;
+ } else {
+ return false;
}
- topmost = topmost->loop_entry;
}
- return topmost;
+ return true;
}
-static void update_loop_entry(struct bpf_verifier_env *env,
- struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+/* Check if bpf_scc_visit instance for @callchain exists. */
+static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env,
+ struct bpf_scc_callchain *callchain)
{
- /* The hdr->branches check decides between cases B and C in
- * comment for get_loop_entry(). If hdr->branches == 0 then
- * head's topmost loop entry is not in current DFS path,
- * hence 'cur' and 'hdr' are not in the same loop and there is
- * no need to update cur->loop_entry.
- */
- if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) {
- if (cur->loop_entry) {
- cur->loop_entry->used_as_loop_entry--;
- maybe_free_verifier_state(env, state_loop_entry_as_list(cur));
- }
- cur->loop_entry = hdr;
- hdr->used_as_loop_entry++;
+	struct bpf_scc_info *info = env->scc_info[callchain->scc];
+	struct bpf_scc_visit *visits;
+	u32 i;
+
+	if (!info)
+		return NULL;
+	visits = info->visits;
+ for (i = 0; i < info->num_visits; i++)
+ if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0)
+ return &visits[i];
+ return NULL;
+}
+
+/* Allocate a new bpf_scc_visit instance corresponding to @callchain.
+ * Allocated instances are alive for a duration of the do_check_common()
+ * call and are freed by free_states().
+ */
+static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env,
+ struct bpf_scc_callchain *callchain)
+{
+ struct bpf_scc_visit *visit;
+ struct bpf_scc_info *info;
+ u32 scc, num_visits;
+ u64 new_sz;
+
+ scc = callchain->scc;
+ info = env->scc_info[scc];
+ num_visits = info ? info->num_visits : 0;
+ new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1);
+ info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT);
+ if (!info)
+ return NULL;
+ env->scc_info[scc] = info;
+ info->num_visits = num_visits + 1;
+ visit = &info->visits[num_visits];
+ memset(visit, 0, sizeof(*visit));
+ memcpy(&visit->callchain, callchain, sizeof(*callchain));
+ return visit;
+}
+
+/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */
+static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
+{
+ char *buf = env->tmp_str_buf;
+ int i, delta = 0;
+
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "(");
+ for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) {
+ if (!callchain->callsites[i])
+ break;
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,",
+ callchain->callsites[i]);
}
+ delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc);
+ return env->tmp_str_buf;
}
-static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+/* If callchain for @st exists (@st is in some SCC), ensure that
+ * bpf_scc_visit instance for this callchain exists.
+ * If instance does not exist or is empty, assign visit->entry_state to @st.
+ */
+static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return 0;
+ visit = scc_visit_lookup(env, callchain);
+ visit = visit ?: scc_visit_alloc(env, callchain);
+ if (!visit)
+ return -ENOMEM;
+ if (!visit->entry_state) {
+ visit->entry_state = st;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC enter %s\n", format_callchain(env, callchain));
+ }
+ return 0;
+}
+
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit);
+
+/* If callchain for @st exists (@st is in some SCC), make it empty:
+ * - set visit->entry_state to NULL;
+ * - flush accumulated backedges.
+ */
+static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return 0;
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit) {
+ verifier_bug(env, "scc exit: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ if (visit->entry_state != st)
+ return 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC exit %s\n", format_callchain(env, callchain));
+ visit->entry_state = NULL;
+ env->num_backedges -= visit->num_backedges;
+ visit->num_backedges = 0;
+ update_peak_states(env);
+ return propagate_backedges(env, visit);
+}
+
+/* Look up a bpf_scc_visit instance corresponding to @st callchain
+ * and add @backedge to visit->backedges. @st callchain must exist.
+ */
+static int add_scc_backedge(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st,
+ struct bpf_scc_backedge *backedge)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain)) {
+ verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
+ st->insn_idx);
+ return -EFAULT;
+ }
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit) {
+ verifier_bug(env, "add backedge: no visit info for call chain %s",
+ format_callchain(env, callchain));
+ return -EFAULT;
+ }
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "SCC backedge %s\n", format_callchain(env, callchain));
+ backedge->next = visit->backedges;
+ visit->backedges = backedge;
+ visit->num_backedges++;
+ env->num_backedges++;
+ update_peak_states(env);
+ return 0;
+}
+
+/* bpf_reg_state->live marks for registers in a state @st are incomplete,
+ * if state @st is in some SCC and not all execution paths starting at this
+ * SCC are fully explored.
+ */
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_scc_callchain *callchain = &env->callchain_buf;
+ struct bpf_scc_visit *visit;
+
+ if (!compute_scc_callchain(env, st, callchain))
+ return false;
+ visit = scc_visit_lookup(env, callchain);
+ if (!visit)
+ return false;
+ return !!visit->backedges;
+}
+
+static void free_backedges(struct bpf_scc_visit *visit)
+{
+ struct bpf_scc_backedge *backedge, *next;
+
+ for (backedge = visit->backedges; backedge; backedge = next) {
+ free_verifier_state(&backedge->state, false);
+ next = backedge->next;
+ kvfree(backedge);
+ }
+ visit->backedges = NULL;
+}
+
+static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
struct bpf_verifier_state_list *sl = NULL, *parent_sl;
struct bpf_verifier_state *parent;
+ int err;
while (st) {
u32 br = --st->branches;
- /* br == 0 signals that DFS exploration for 'st' is finished,
- * thus it is necessary to update parent's loop entry if it
- * turned out that st is a part of some loop.
- * This is a part of 'case A' in get_loop_entry() comment.
- */
- if (br == 0 && st->parent && st->loop_entry)
- update_loop_entry(env, st->parent, st->loop_entry);
-
- /* WARN_ON(br > 1) technically makes sense here,
+ /* verifier_bug_if(br > 1, ...) technically makes sense here,
* but see comment in push_stack(), hence:
*/
- WARN_ONCE((int)br < 0,
- "BUG update_branch_counts:branches_to_explore=%d\n",
- br);
+ verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br);
if (br)
break;
+ err = maybe_exit_scc(env, st);
+ if (err)
+ return err;
parent = st->parent;
parent_sl = state_parent_as_list(st);
if (sl)
@@ -1983,6 +2047,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
st = parent;
sl = parent_sl;
}
+ return 0;
}
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
@@ -2014,6 +2079,18 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
return 0;
}
+static bool error_recoverable_with_nospec(int err)
+{
+ /* Should only return true for non-fatal errors that are allowed to
+ * occur during speculative verification. For these we can insert a
+ * nospec and the program might still be accepted. Do not include
+ * something like ENOMEM because it is likely to re-occur for the next
+ * architectural path once it has been recovered-from in all speculative
+ * paths.
+ */
+ return err == -EPERM || err == -EACCES || err == -EINVAL;
+}
+
static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
int insn_idx, int prev_insn_idx,
bool speculative)
@@ -2022,9 +2099,9 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
struct bpf_verifier_stack_elem *elem;
int err;
- elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
if (!elem)
- goto err;
+ return NULL;
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
@@ -2034,12 +2111,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
if (err)
- goto err;
+ return NULL;
elem->st.speculative |= speculative;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
verbose(env, "The sequence of %d jumps is too complex.\n",
env->stack_size);
- goto err;
+ return NULL;
}
if (elem->st.parent) {
++elem->st.parent->branches;
@@ -2054,12 +2131,6 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
*/
}
return &elem->st;
-err:
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- /* pop all elements and return */
- while (!pop_stack(env, NULL, NULL, false));
- return NULL;
}
#define CALLER_SAVED_REGS 6
@@ -2452,6 +2523,58 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg)
if ((u64)reg->smin_value <= (u64)reg->smax_value) {
reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value);
reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value);
+ } else {
+ /* If the s64 range crosses the sign boundary, then it's split
+ * between the beginning and end of the U64 domain. In that
+ * case, we can derive new bounds if the u64 range overlaps
+ * with only one end of the s64 range.
+ *
+ * In the following example, the u64 range overlaps only with
+ * positive portion of the s64 range.
+ *
+ * 0 U64_MAX
+ * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxx s64 range xxxxxxxxx] [xxxxxxx|
+ * 0 S64_MAX S64_MIN -1
+ *
+ * We can thus derive the following new s64 and u64 ranges.
+ *
+ * 0 U64_MAX
+ * | [xxxxxx u64 range xxxxx] |
+ * |----------------------------|----------------------------|
+ * | [xxxxxx s64 range xxxxx] |
+ * 0 S64_MAX S64_MIN -1
+ *
+ * If they overlap in two places, we can't derive anything
+ * because reg_state can't represent two ranges per numeric
+ * domain.
+ *
+ * 0 U64_MAX
+ * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx|
+ * 0 S64_MAX S64_MIN -1
+ *
+ * The first condition below corresponds to the first diagram
+ * above.
+ */
+ if (reg->umax_value < (u64)reg->smin_value) {
+ reg->smin_value = (s64)reg->umin_value;
+ reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value);
+ } else if ((u64)reg->smax_value < reg->umin_value) {
+ /* This second condition considers the case where the u64 range
+ * overlaps with the negative portion of the s64 range:
+ *
+ * 0 U64_MAX
+ * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] |
+ * |----------------------------|----------------------------|
+ * |xxxxxxxxx] [xxxxxxxxxxxx s64 range |
+ * 0 S64_MAX S64_MIN -1
+ */
+ reg->smax_value = (s64)reg->umax_value;
+ reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value);
+ }
}
}
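A quick numeric check of the first new branch (illustrative values, not taken from the patch): take an s64 range of [-2, 5], which crosses the sign boundary, together with a u64 range of [3, 10].

	(u64)smin = 0xfffffffffffffffe, and umax = 10 < (u64)smin, so only
	the non-negative s64 portion [0, 5] can overlap the u64 range:
		smin := (s64)umin        = 3
		umax := min(umax, smax)  = 5
	leaving [3, 5] in both domains, which matches the exact set
	intersection {3, 4, 5}.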
@@ -2483,20 +2606,6 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg)
reg->smin_value = max_t(s64, reg->smin_value, new_smin);
reg->smax_value = min_t(s64, reg->smax_value, new_smax);
- /* if s32 can be treated as valid u32 range, we can use it as well */
- if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) {
- /* s32 -> u64 tightening */
- new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
- new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
- reg->umin_value = max_t(u64, reg->umin_value, new_umin);
- reg->umax_value = min_t(u64, reg->umax_value, new_umax);
- /* s32 -> s64 tightening */
- new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value;
- new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value;
- reg->smin_value = max_t(s64, reg->smin_value, new_smin);
- reg->smax_value = min_t(s64, reg->smax_value, new_smax);
- }
-
/* Here we would like to handle a special case after sign extending load,
* when upper bits for a 64-bit range are all 1s or all 0s.
*
@@ -2563,6 +2672,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg)
/* We might have learned something about the sign bit. */
__reg_deduce_bounds(reg);
__reg_deduce_bounds(reg);
+ __reg_deduce_bounds(reg);
/* We might have learned some bits from the bounds. */
__reg_bound_offset(reg);
/* Intersecting with the old var_off might have improved our bounds
@@ -2609,13 +2719,13 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env,
return 0;
out:
- verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
- "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n",
- ctx, msg, reg->umin_value, reg->umax_value,
- reg->smin_value, reg->smax_value,
- reg->u32_min_value, reg->u32_max_value,
- reg->s32_min_value, reg->s32_max_value,
- reg->var_off.value, reg->var_off.mask);
+ verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] "
+ "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)",
+ ctx, msg, reg->umin_value, reg->umax_value,
+ reg->smin_value, reg->smax_value,
+ reg->u32_min_value, reg->u32_max_value,
+ reg->s32_min_value, reg->s32_max_value,
+ reg->var_off.value, reg->var_off.mask);
if (env->test_reg_invariants)
return -EFAULT;
__mark_reg_unbounded(reg);
@@ -2725,22 +2835,33 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
__mark_reg_not_init(env, regs + regno);
}
-static void mark_btf_ld_reg(struct bpf_verifier_env *env,
- struct bpf_reg_state *regs, u32 regno,
- enum bpf_reg_type reg_type,
- struct btf *btf, u32 btf_id,
- enum bpf_type_flag flag)
+static int mark_btf_ld_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *regs, u32 regno,
+ enum bpf_reg_type reg_type,
+ struct btf *btf, u32 btf_id,
+ enum bpf_type_flag flag)
{
- if (reg_type == SCALAR_VALUE) {
+ switch (reg_type) {
+ case SCALAR_VALUE:
mark_reg_unknown(env, regs, regno);
- return;
+ return 0;
+ case PTR_TO_BTF_ID:
+ mark_reg_known_zero(env, regs, regno);
+ regs[regno].type = PTR_TO_BTF_ID | flag;
+ regs[regno].btf = btf;
+ regs[regno].btf_id = btf_id;
+ if (type_may_be_null(flag))
+ regs[regno].id = ++env->id_gen;
+ return 0;
+ case PTR_TO_MEM:
+ mark_reg_known_zero(env, regs, regno);
+ regs[regno].type = PTR_TO_MEM | flag;
+ regs[regno].mem_size = 0;
+ return 0;
+ default:
+		verifier_bug(env, "unexpected reg_type %d in %s", reg_type, __func__);
+ return -EFAULT;
}
- mark_reg_known_zero(env, regs, regno);
- regs[regno].type = PTR_TO_BTF_ID | flag;
- regs[regno].btf = btf;
- regs[regno].btf_id = btf_id;
- if (type_may_be_null(flag))
- regs[regno].id = ++env->id_gen;
}
#define DEF_NOT_SUBREG (0)
@@ -2789,9 +2910,9 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
struct bpf_verifier_stack_elem *elem;
struct bpf_func_state *frame;
- elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT);
if (!elem)
- goto err;
+ return NULL;
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
@@ -2803,35 +2924,24 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
verbose(env,
"The sequence of %d jumps is too complex for async cb.\n",
env->stack_size);
- goto err;
+ return NULL;
}
/* Unlike push_stack() do not copy_verifier_state().
* The caller state doesn't matter.
* This is async callback. It starts in a fresh stack.
* Initialize it similar to do_check_common().
- * But we do need to make sure to not clobber insn_hist, so we keep
- * chaining insn_hist_start/insn_hist_end indices as for a normal
- * child state.
*/
elem->st.branches = 1;
elem->st.in_sleepable = is_sleepable;
- elem->st.insn_hist_start = env->cur_state->insn_hist_end;
- elem->st.insn_hist_end = elem->st.insn_hist_start;
- frame = kzalloc(sizeof(*frame), GFP_KERNEL);
+ frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT);
if (!frame)
- goto err;
+ return NULL;
init_func_state(env, frame,
BPF_MAIN_FUNC /* callsite */,
0 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
elem->st.frame[0] = frame;
return &elem->st;
-err:
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- /* pop all elements and return */
- while (!pop_stack(env, NULL, NULL, false));
- return NULL;
}
@@ -3169,7 +3279,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
- tab = kzalloc(sizeof(*tab), GFP_KERNEL);
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL_ACCOUNT);
if (!tab)
return -ENOMEM;
prog_aux->kfunc_tab = tab;
@@ -3185,7 +3295,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return 0;
if (!btf_tab && offset) {
- btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL);
+ btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL_ACCOUNT);
if (!btf_tab)
return -ENOMEM;
prog_aux->kfunc_btf_tab = btf_tab;
@@ -3459,12 +3569,11 @@ static int mark_reg_read(struct bpf_verifier_env *env,
/* if read wasn't screened by an earlier write ... */
if (writes && state->live & REG_LIVE_WRITTEN)
break;
- if (parent->live & REG_LIVE_DONE) {
- verbose(env, "verifier BUG type %s var_off %lld off %d\n",
- reg_type_str(env, parent->type),
- parent->var_off.value, parent->off);
+ if (verifier_bug_if(parent->live & REG_LIVE_DONE, env,
+ "type %s var_off %lld off %d",
+ reg_type_str(env, parent->type),
+ parent->var_off.value, parent->off))
return -EFAULT;
- }
/* The first condition is more likely to be true than the
* second, checked it first.
*/
@@ -3649,16 +3758,16 @@ static int insn_def_regno(const struct bpf_insn *insn)
case BPF_ST:
return -1;
case BPF_STX:
- if ((BPF_MODE(insn->code) == BPF_ATOMIC ||
- BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) &&
- (insn->imm & BPF_FETCH)) {
+ if (BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) {
if (insn->imm == BPF_CMPXCHG)
return BPF_REG_0;
- else
+ else if (insn->imm == BPF_LOAD_ACQ)
+ return insn->dst_reg;
+ else if (insn->imm & BPF_FETCH)
return insn->src_reg;
- } else {
- return -1;
}
+ return -1;
default:
return insn->dst_reg;
}
@@ -3846,10 +3955,11 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s)
}
/* for any branch, call, exit record the history of jmps in the given state */
-static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
- int insn_flags, u64 linked_regs)
+static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+ int insn_flags, u64 linked_regs)
{
- struct bpf_insn_hist_entry *p;
+ u32 cnt = cur->jmp_history_cnt;
+ struct bpf_jmp_history_entry *p;
size_t alloc_size;
/* combine instruction flags if we already recorded this instruction */
@@ -3857,44 +3967,41 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s
/* atomic instructions push insn_flags twice, for READ and
* WRITE sides, but they should agree on stack slot
*/
- WARN_ONCE((env->cur_hist_ent->flags & insn_flags) &&
- (env->cur_hist_ent->flags & insn_flags) != insn_flags,
- "verifier insn history bug: insn_idx %d cur flags %x new flags %x\n",
- env->insn_idx, env->cur_hist_ent->flags, insn_flags);
+ verifier_bug_if((env->cur_hist_ent->flags & insn_flags) &&
+ (env->cur_hist_ent->flags & insn_flags) != insn_flags,
+ env, "insn history: insn_idx %d cur flags %x new flags %x",
+ env->insn_idx, env->cur_hist_ent->flags, insn_flags);
env->cur_hist_ent->flags |= insn_flags;
- WARN_ONCE(env->cur_hist_ent->linked_regs != 0,
- "verifier insn history bug: insn_idx %d linked_regs != 0: %#llx\n",
- env->insn_idx, env->cur_hist_ent->linked_regs);
+ verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env,
+ "insn history: insn_idx %d linked_regs: %#llx",
+ env->insn_idx, env->cur_hist_ent->linked_regs);
env->cur_hist_ent->linked_regs = linked_regs;
return 0;
}
- if (cur->insn_hist_end + 1 > env->insn_hist_cap) {
- alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p));
- p = kvrealloc(env->insn_hist, alloc_size, GFP_USER);
- if (!p)
- return -ENOMEM;
- env->insn_hist = p;
- env->insn_hist_cap = alloc_size / sizeof(*p);
- }
+ cnt++;
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT);
+ if (!p)
+ return -ENOMEM;
+ cur->jmp_history = p;
- p = &env->insn_hist[cur->insn_hist_end];
+ p = &cur->jmp_history[cnt - 1];
p->idx = env->insn_idx;
p->prev_idx = env->prev_insn_idx;
p->flags = insn_flags;
p->linked_regs = linked_regs;
-
- cur->insn_hist_end++;
+ cur->jmp_history_cnt = cnt;
env->cur_hist_ent = p;
return 0;
}
-static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env,
- u32 hist_start, u32 hist_end, int insn_idx)
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+ u32 hist_end, int insn_idx)
{
- if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx)
- return &env->insn_hist[hist_end - 1];
+ if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+ return &st->jmp_history[hist_end - 1];
return NULL;
}
@@ -3911,26 +4018,25 @@ static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *
* history entry recording a jump from last instruction of parent state and
* first instruction of given state.
*/
-static int get_prev_insn_idx(const struct bpf_verifier_env *env,
- struct bpf_verifier_state *st,
- int insn_idx, u32 hist_start, u32 *hist_endp)
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+ u32 *history)
{
- u32 hist_end = *hist_endp;
- u32 cnt = hist_end - hist_start;
+ u32 cnt = *history;
- if (insn_idx == st->first_insn_idx) {
+ if (i == st->first_insn_idx) {
if (cnt == 0)
return -ENOENT;
- if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx)
+ if (cnt == 1 && st->jmp_history[0].idx == i)
return -ENOENT;
}
- if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) {
- (*hist_endp)--;
- return env->insn_hist[hist_end - 1].prev_idx;
+ if (cnt && st->jmp_history[cnt - 1].idx == i) {
+ i = st->jmp_history[cnt - 1].prev_idx;
+ (*history)--;
} else {
- return insn_idx - 1;
+ i--;
}
+ return i;
}
static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
@@ -3987,8 +4093,7 @@ static inline u32 bt_empty(struct backtrack_state *bt)
static inline int bt_subprog_enter(struct backtrack_state *bt)
{
if (bt->frame == MAX_CALL_FRAMES - 1) {
- verbose(bt->env, "BUG subprog enter from frame %d\n", bt->frame);
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(bt->env, "subprog enter from frame %d", bt->frame);
return -EFAULT;
}
bt->frame++;
@@ -3998,8 +4103,7 @@ static inline int bt_subprog_enter(struct backtrack_state *bt)
static inline int bt_subprog_exit(struct backtrack_state *bt)
{
if (bt->frame == 0) {
- verbose(bt->env, "BUG subprog exit from frame 0\n");
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(bt->env, "subprog exit from frame 0");
return -EFAULT;
}
bt->frame--;
@@ -4113,7 +4217,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
/* If any register R in hist->linked_regs is marked as precise in bt,
* do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
*/
-static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist)
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
{
struct linked_regs linked_regs;
bool some_precise = false;
@@ -4158,7 +4262,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
* - *was* processed previously during backtracking.
*/
static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
- struct bpf_insn_hist_entry *hist, struct backtrack_state *bt)
+ struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
{
struct bpf_insn *insn = env->prog->insnsi + idx;
u8 class = BPF_CLASS(insn->code);
@@ -4277,14 +4381,15 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* should be literally next instruction in
* caller program
*/
- WARN_ONCE(idx + 1 != subseq_idx, "verifier backtracking bug");
+ verifier_bug_if(idx + 1 != subseq_idx, env,
+ "extra insn from subprog");
/* r1-r5 are invalidated after subprog call,
* so for global func call it shouldn't be set
* anymore
*/
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "global subprog unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
/* global subprog always sets R0 */
@@ -4298,16 +4403,17 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* the current frame should be zero by now
*/
if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "static subprog unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
/* we are now tracking register spills correctly,
* so any instance of leftover slots is a bug
*/
if (bt_stack_mask(bt) != 0) {
- verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug (subprog leftover stack slots)");
+ verifier_bug(env,
+ "static subprog leftover stack slots %llx",
+ bt_stack_mask(bt));
return -EFAULT;
}
/* propagate r1-r5 to the caller */
@@ -4330,13 +4436,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* not actually arguments passed directly to callback subprogs
*/
if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "callback unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
if (bt_stack_mask(bt) != 0) {
- verbose(env, "BUG stack slots %llx\n", bt_stack_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug (callback leftover stack slots)");
+ verifier_bug(env, "callback leftover stack slots %llx",
+ bt_stack_mask(bt));
return -EFAULT;
}
/* clear r1-r5 in callback subprog's mask */
@@ -4355,11 +4461,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
/* regular helper call sets R0 */
bt_clear_reg(bt, BPF_REG_0);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- /* if backtracing was looking for registers R1-R5
+ /* if backtracking was looking for registers R1-R5
* they should have been found already.
*/
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking call unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
} else if (opcode == BPF_EXIT) {
@@ -4377,8 +4483,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
for (i = BPF_REG_1; i <= BPF_REG_5; i++)
bt_clear_reg(bt, i);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking exit unexpected regs %x",
+ bt_reg_mask(bt));
return -EFAULT;
}
@@ -4413,8 +4519,10 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* before it would be equally necessary to
* propagate it to dreg.
*/
- bt_set_reg(bt, dreg);
- bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK))
+ bt_set_reg(bt, sreg);
+ if (!hist || !(hist->flags & INSN_F_DST_REG_STACK))
+ bt_set_reg(bt, dreg);
} else if (BPF_SRC(insn->code) == BPF_K) {
/* dreg <cond> K
* Only dreg still needs precision before
@@ -4449,7 +4557,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* . if (scalar cond K|scalar)
* . helper_call(.., scalar, ...) where ARG_CONST is expected
* backtrack through the verifier states and mark all registers and
- * stack slots with spilled constants that these scalar regisers
+ * stack slots with spilled constants that these scalar registers
* should be precise.
* . during state pruning two registers (or spilled stack slots)
* are equivalent if both are not precise.
@@ -4572,7 +4680,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
* SCALARS, as well as any other registers and slots that contribute to
* a tracked state of given registers/stack slots, depending on specific BPF
* assembly instructions (see backtrack_insns() for exact instruction handling
- * logic). This backtracking relies on recorded insn_hist and is able to
+ * logic). This backtracking relies on recorded jmp_history and is able to
* traverse entire chain of parent states. This process ends only when all the
* necessary registers/slots and their transitive dependencies are marked as
* precise.
@@ -4652,23 +4760,27 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
* mark_all_scalars_imprecise() to hopefully get more permissive and generic
* finalized states which help in short circuiting more future states.
*/
-static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
+static int __mark_chain_precision(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *starting_state,
+ int regno,
+ bool *changed)
{
+ struct bpf_verifier_state *st = starting_state;
struct backtrack_state *bt = &env->bt;
- struct bpf_verifier_state *st = env->cur_state;
int first_idx = st->first_insn_idx;
- int last_idx = env->insn_idx;
+ int last_idx = starting_state->insn_idx;
int subseq_idx = -1;
struct bpf_func_state *func;
+ bool tmp, skip_first = true;
struct bpf_reg_state *reg;
- bool skip_first = true;
int i, fr, err;
if (!env->bpf_capable)
return 0;
+ changed = changed ?: &tmp;
/* set frame number from which we are starting to backtrack */
- bt_init(bt, env->cur_state->curframe);
+ bt_init(bt, starting_state->curframe);
/* Do sanity checks against current state of register and/or stack
* slot, but don't set precise flag in current state, as precision
@@ -4678,7 +4790,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
- WARN_ONCE(1, "backtracing misuse");
+ verifier_bug(env, "backtracking misuse");
return -EFAULT;
}
bt_set_reg(bt, regno);
@@ -4689,9 +4801,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
for (;;) {
DECLARE_BITMAP(mask, 64);
- u32 hist_start = st->insn_hist_start;
- u32 hist_end = st->insn_hist_end;
- struct bpf_insn_hist_entry *hist;
+ u32 history = st->jmp_history_cnt;
+ struct bpf_jmp_history_entry *hist;
if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
@@ -4713,15 +4824,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
for_each_set_bit(i, mask, 32) {
reg = &st->frame[0]->regs[i];
bt_clear_reg(bt, i);
- if (reg->type == SCALAR_VALUE)
+ if (reg->type == SCALAR_VALUE) {
reg->precise = true;
+ *changed = true;
+ }
}
return 0;
}
- verbose(env, "BUG backtracking func entry subprog %d reg_mask %x stack_mask %llx\n",
- st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx",
+ st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt));
return -EFAULT;
}
@@ -4730,11 +4842,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
err = 0;
skip_first = false;
} else {
- hist = get_insn_hist_entry(env, hist_start, hist_end, i);
+ hist = get_jmp_hist_entry(st, history, i);
err = backtrack_insn(env, i, subseq_idx, hist, bt);
}
if (err == -ENOTSUPP) {
- mark_all_scalars_precise(env, env->cur_state);
+ mark_all_scalars_precise(env, starting_state);
bt_reset(bt);
return 0;
} else if (err) {
@@ -4747,7 +4859,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
*/
return 0;
subseq_idx = i;
- i = get_prev_insn_idx(env, st, i, hist_start, &hist_end);
+ i = get_prev_insn_idx(st, i, &history);
if (i == -ENOENT)
break;
if (i >= env->prog->len) {
@@ -4757,8 +4869,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
* It means the backtracking missed the spot where
* particular register was initialized with a constant.
*/
- verbose(env, "BUG backtracking idx %d\n", i);
- WARN_ONCE(1, "verifier backtracking bug");
+ verifier_bug(env, "backtracking idx %d", i);
return -EFAULT;
}
}
@@ -4775,30 +4886,32 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
bt_clear_frame_reg(bt, fr, i);
continue;
}
- if (reg->precise)
+ if (reg->precise) {
bt_clear_frame_reg(bt, fr, i);
- else
+ } else {
reg->precise = true;
+ *changed = true;
+ }
}
bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr));
for_each_set_bit(i, mask, 64) {
- if (i >= func->allocated_stack / BPF_REG_SIZE) {
- verbose(env, "BUG backtracking (stack slot %d, total slots %d)\n",
- i, func->allocated_stack / BPF_REG_SIZE);
- WARN_ONCE(1, "verifier backtracking bug (stack slot out of bounds)");
+ if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE,
+ env, "stack slot %d, total slots %d",
+ i, func->allocated_stack / BPF_REG_SIZE))
return -EFAULT;
- }
if (!is_spilled_scalar_reg(&func->stack[i])) {
bt_clear_frame_slot(bt, fr, i);
continue;
}
reg = &func->stack[i].spilled_ptr;
- if (reg->precise)
+ if (reg->precise) {
bt_clear_frame_slot(bt, fr, i);
- else
+ } else {
reg->precise = true;
+ *changed = true;
+ }
}
if (env->log.level & BPF_LOG_LEVEL2) {
fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
@@ -4825,7 +4938,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
* fallback to marking all precise
*/
if (!bt_empty(bt)) {
- mark_all_scalars_precise(env, env->cur_state);
+ mark_all_scalars_precise(env, starting_state);
bt_reset(bt);
}
@@ -4834,15 +4947,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, regno);
+ return __mark_chain_precision(env, env->cur_state, regno, NULL);
}
/* mark_chain_precision_batch() assumes that env->bt is set in the caller to
* desired reg and stack masks across all relevant frames
*/
-static int mark_chain_precision_batch(struct bpf_verifier_env *env)
+static int mark_chain_precision_batch(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *starting_state)
{
- return __mark_chain_precision(env, -1);
+ return __mark_chain_precision(env, starting_state, -1, NULL);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -5031,7 +5145,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (sanitize)
- env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+ env->insn_aux_data[insn_idx].nospec_result = true;
}
err = destroy_if_dynptr_stack_slot(env, state, spi);
@@ -5114,7 +5228,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
}
if (insn_flags)
- return push_insn_history(env, env->cur_state, insn_flags, 0);
+ return push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -5421,7 +5535,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
insn_flags = 0; /* we are not restoring spilled register */
}
if (insn_flags)
- return push_insn_history(env, env->cur_state, insn_flags, 0);
+ return push_jmp_history(env, env->cur_state, insn_flags, 0);
return 0;
}
@@ -5901,6 +6015,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
int class = BPF_CLASS(insn->code);
struct bpf_reg_state *val_reg;
+ int ret;
/* Things we already checked for in check_map_access and caller:
* - Reject cases where variable offset may touch kptr
@@ -5934,8 +6049,11 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
/* We can simply mark the value_regno receiving the pointer
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
- mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
- kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field));
+ ret = mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID,
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ btf_ld_kptr_type(env, kptr_field));
+ if (ret < 0)
+ return ret;
} else if (class == BPF_STX) {
val_reg = reg_state(env, value_regno);
if (!register_is_null(val_reg) &&
@@ -6561,21 +6679,18 @@ continue_func:
/* find the callee */
next_insn = i + insn[i].imm + 1;
sidx = find_subprog(env, next_insn);
- if (sidx < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- next_insn);
+ if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn))
return -EFAULT;
- }
if (subprog[sidx].is_async_cb) {
if (subprog[sidx].has_tail_call) {
- verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+ verifier_bug(env, "subprog has tail_call and async cb");
return -EFAULT;
}
/* async callbacks don't increase bpf prog stack size unless called directly */
if (!bpf_pseudo_call(insn + i))
continue;
if (subprog[sidx].is_exception_cb) {
- verbose(env, "insn %d cannot call exception cb directly\n", i);
+ verbose(env, "insn %d cannot call exception cb directly", i);
return -EINVAL;
}
}
@@ -6675,11 +6790,8 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
int start = idx + insn->imm + 1, subprog;
subprog = find_subprog(env, start);
- if (subprog < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- start);
+ if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start))
return -EFAULT;
- }
return env->subprog_info[subprog].stack_depth;
}
#endif
@@ -7004,6 +7116,10 @@ BTF_TYPE_SAFE_RCU(struct css_set) {
struct cgroup *dfl_cgrp;
};
+BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) {
+ struct cgroup *cgroup;
+};
+
/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
struct file __rcu *exe_file;
@@ -7038,8 +7154,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) {
struct inode *f_inode;
};
-BTF_TYPE_SAFE_TRUSTED(struct dentry) {
- /* no negative dentry-s in places where bpf can see it */
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) {
struct inode *d_inode;
};
@@ -7054,6 +7169,7 @@ static bool type_is_rcu(struct bpf_verifier_env *env,
BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
}
@@ -7077,7 +7193,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env,
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
- BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
}
@@ -7087,6 +7202,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
const char *field_name, u32 btf_id)
{
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
"__safe_trusted_or_null");
@@ -7150,7 +7266,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
if (!btf_is_kernel(reg->btf)) {
- verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
+ verifier_bug(env, "reg->btf must be kernel btf");
return -EFAULT;
}
ret = env->ops->btf_struct_access(&env->log, reg, off, size);
@@ -7166,7 +7282,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) &&
!(reg->type & MEM_RCU) && !reg->ref_obj_id) {
- verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
+ verifier_bug(env, "ref_obj_id for allocated object must be non-zero");
return -EFAULT;
}
@@ -7236,8 +7352,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
clear_trusted_flags(&flag);
}
- if (atype == BPF_READ && value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
+ if (atype == BPF_READ && value_regno >= 0) {
+ ret = mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
+ if (ret < 0)
+ return ret;
+ }
return 0;
}
@@ -7291,13 +7410,19 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
/* Simulate access to a PTR_TO_BTF_ID */
memset(&map_reg, 0, sizeof(map_reg));
- mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
+ ret = mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID,
+ btf_vmlinux, *map->ops->map_btf_id, 0);
+ if (ret < 0)
+ return ret;
ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
if (ret < 0)
return ret;
- if (value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
+ if (value_regno >= 0) {
+ ret = mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
+ if (ret < 0)
+ return ret;
+ }
return 0;
}
@@ -7481,6 +7606,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
}
} else if (base_type(reg->type) == PTR_TO_MEM) {
bool rdonly_mem = type_is_rdonly_mem(reg->type);
+ bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED);
if (type_may_be_null(reg->type)) {
verbose(env, "R%d invalid mem access '%s'\n", regno,
@@ -7500,8 +7626,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- err = check_mem_region_access(env, regno, off, size,
- reg->mem_size, false);
+ /*
+ * Accesses to untrusted PTR_TO_MEM are done through probe
+ * instructions, hence no need to check bounds in that case.
+ */
+ if (!rdonly_untrusted)
+ err = check_mem_region_access(env, regno, off, size,
+ reg->mem_size, false);
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
@@ -7984,7 +8115,7 @@ static int check_stack_range_initialized(
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
if (state->allocated_stack <= slot) {
- verbose(env, "verifier bug: allocated_stack too small\n");
+ verbose(env, "allocated_stack too small\n");
return -EFAULT;
}
@@ -8413,7 +8544,7 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
if (meta->map_ptr) {
- verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ verifier_bug(env, "Two map pointers in a timer helper");
return -EFAULT;
}
meta->map_uid = reg->map_uid;
@@ -8528,7 +8659,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
*/
if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
- verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
+ verifier_bug(env, "misconfigured dynptr helper type flags");
return -EFAULT;
}
@@ -8894,8 +9025,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
- verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
- cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+ verifier_bug(env, "unexpected iterator state %d (%s)",
+ cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
return -EFAULT;
}
@@ -8905,7 +9036,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
*/
if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx ||
!same_callsites(cur_st->parent, cur_st)) {
- verbose(env, "bug: bad parent state for iter next call");
+ verifier_bug(env, "bad parent state for iter next call");
return -EFAULT;
}
/* Note cur_st->parent in the call below, it is necessary to skip
@@ -8964,8 +9095,8 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env,
{
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose(env, "invalid map_ptr to access map->type\n");
- return -EACCES;
+ verifier_bug(env, "invalid map_ptr to access map->type");
+ return -EFAULT;
}
switch (meta->map_ptr->map_type) {
@@ -9111,7 +9242,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
compatible = compatible_reg_types[base_type(arg_type)];
if (!compatible) {
- verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
+ verifier_bug(env, "unsupported arg type %d", arg_type);
return -EFAULT;
}
@@ -9193,7 +9324,7 @@ found:
if (!arg_btf_id) {
if (!compatible->btf_id) {
- verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
+ verifier_bug(env, "missing arg compatible BTF ID");
return -EFAULT;
}
arg_btf_id = compatible->btf_id;
@@ -9225,7 +9356,7 @@ found:
case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC:
if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
meta->func_id != BPF_FUNC_kptr_xchg) {
- verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
+ verifier_bug(env, "unimplemented handling of MEM_ALLOC");
return -EFAULT;
}
/* Check if local kptr in src arg matches kptr in dst arg */
@@ -9240,7 +9371,7 @@ found:
/* Handled by helper specific checks */
break;
default:
- verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n");
+ verifier_bug(env, "invalid PTR_TO_BTF_ID register for type match");
return -EFAULT;
}
return 0;
@@ -9501,7 +9632,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env,
* to prevent pruning on it.
*/
bt_set_frame_slot(&env->bt, key->frameno, spi);
- err = mark_chain_precision_batch(env);
+ err = mark_chain_precision_batch(env, env->cur_state);
if (err < 0)
return err;
@@ -9598,7 +9729,7 @@ skip_type_check:
return -EINVAL;
}
if (meta->release_regno) {
- verbose(env, "verifier internal error: more than one release argument\n");
+ verifier_bug(env, "more than one release argument");
return -EFAULT;
}
meta->release_regno = regno;
@@ -9606,10 +9737,10 @@ skip_type_check:
if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) {
if (meta->ref_obj_id) {
- verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ verbose(env, "more than one arg with ref_obj_id R%d %u %u",
regno, reg->ref_obj_id,
meta->ref_obj_id);
return -EFAULT;
}
meta->ref_obj_id = reg->ref_obj_id;
}
@@ -9652,8 +9783,8 @@ skip_type_check:
* we have to check map_key here. Otherwise it means
* that kernel subsystem misconfigured verifier
*/
- verbose(env, "invalid map_ptr to access map->key\n");
- return -EACCES;
+ verifier_bug(env, "invalid map_ptr to access map->key");
+ return -EFAULT;
}
key_size = meta->map_ptr->key_size;
err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
@@ -9679,8 +9810,8 @@ skip_type_check:
*/
if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
- verbose(env, "invalid map_ptr to access map->value\n");
- return -EACCES;
+ verifier_bug(env, "invalid map_ptr to access map->value");
+ return -EFAULT;
}
meta->raw_mode = arg_type & MEM_UNINIT;
err = check_helper_mem_access(env, regno, meta->map_ptr->value_size,
@@ -9709,7 +9840,7 @@ skip_type_check:
if (err)
return err;
} else {
- verbose(env, "verifier internal error\n");
+ verifier_bug(env, "spin lock arg on unexpected helper");
return -EFAULT;
}
break;
@@ -10285,13 +10416,12 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls
}
if (state->frame[state->curframe + 1]) {
- verbose(env, "verifier bug. Frame %d already allocated\n",
- state->curframe + 1);
+ verifier_bug(env, "Frame %d already allocated", state->curframe + 1);
return -EFAULT;
}
caller = state->frame[state->curframe];
- callee = kzalloc(sizeof(*callee), GFP_KERNEL);
+ callee = kzalloc(sizeof(*callee), GFP_KERNEL_ACCOUNT);
if (!callee)
return -ENOMEM;
state->frame[state->curframe + 1] = callee;
@@ -10346,6 +10476,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
bpf_log(log, "R%d is not a scalar\n", regno);
return -EINVAL;
}
+ } else if (arg->arg_type & PTR_UNTRUSTED) {
+ /*
+ * Anything is allowed for untrusted arguments, as these are
+ * read-only and probe read instructions would protect against
+ * invalid memory access.
+ */
} else if (arg->arg_type == ARG_PTR_TO_CTX) {
ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE);
if (ret < 0)
@@ -10400,8 +10536,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
if (err)
return err;
} else {
- bpf_log(log, "verifier bug: unrecognized arg#%d type %d\n",
- i, arg->arg_type);
+ verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type);
return -EFAULT;
}
}
@@ -10464,13 +10599,13 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
env->subprog_info[subprog].is_cb = true;
if (bpf_pseudo_kfunc_call(insn) &&
!is_callback_calling_kfunc(insn->imm)) {
- verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
+ verifier_bug(env, "kfunc %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
return -EFAULT;
} else if (!bpf_pseudo_kfunc_call(insn) &&
!is_callback_calling_function(insn->imm)) { /* helper */
- verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
+ verifier_bug(env, "helper %s#%d not marked as callback-calling",
+ func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
@@ -10522,10 +10657,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
target_insn = *insn_idx + insn->imm + 1;
subprog = find_subprog(env, target_insn);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
+ if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program",
+ target_insn))
return -EFAULT;
- }
caller = state->frame[state->curframe];
err = btf_check_subprog_call(env, subprog, caller->regs);
@@ -10870,8 +11004,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return -EINVAL;
}
if (!calls_callback(env, callee->callsite)) {
- verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
- *insn_idx, callee->callsite);
+ verifier_bug(env, "in callback at %d, callsite %d !calls_callback",
+ *insn_idx, callee->callsite);
return -EFAULT;
}
} else {
@@ -10978,8 +11112,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
return 0;
if (map == NULL) {
- verbose(env, "kernel subsystem misconfigured verifier\n");
- return -EINVAL;
+ verifier_bug(env, "expected map for helper call");
+ return -EFAULT;
}
/* In case of read-only, some additional restrictions
@@ -11017,7 +11151,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
if (func_id != BPF_FUNC_tail_call)
return 0;
if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) {
- verbose(env, "kernel subsystem misconfigured verifier\n");
+ verbose(env, "expected prog array map for tail call");
return -EINVAL;
}
@@ -11124,7 +11258,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr,
fmt_map_off);
if (err) {
- verbose(env, "verifier bug\n");
+ verbose(env, "failed to retrieve map value address\n");
return -EFAULT;
}
fmt = (char *)(long)fmt_addr + fmt_map_off;
@@ -11160,7 +11294,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
return -ENOTSUPP;
}
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env)
{
return &env->insn_aux_data[env->insn_idx];
}
@@ -11270,9 +11404,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(func_id);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
- verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
- func_id_name(func_id), func_id);
- return -EINVAL;
+ verifier_bug(env, "func %s#%d: r1 != ctx", func_id_name(func_id), func_id);
+ return -EFAULT;
}
memset(&meta, 0, sizeof(meta));
@@ -11280,8 +11413,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
err = check_func_proto(fn, func_id);
if (err) {
- verbose(env, "kernel subsystem misconfigured func %s#%d\n",
- func_id_name(func_id), func_id);
+ verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id);
return err;
}
@@ -11354,7 +11486,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
*/
if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
- verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
+ verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released");
return -EFAULT;
}
err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
@@ -11471,23 +11603,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (meta.dynptr_id) {
- verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+ verifier_bug(env, "meta.dynptr_id already set");
return -EFAULT;
}
if (meta.ref_obj_id) {
- verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
+ verifier_bug(env, "meta.ref_obj_id already set");
return -EFAULT;
}
id = dynptr_id(env, reg);
if (id < 0) {
- verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ verifier_bug(env, "failed to obtain dynptr id");
return id;
}
ref_obj_id = dynptr_ref_obj_id(env, reg);
if (ref_obj_id < 0) {
- verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+ verifier_bug(env, "failed to obtain dynptr ref_obj_id");
return ref_obj_id;
}
@@ -11572,9 +11704,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
* to map element returned from bpf_map_lookup_elem()
*/
if (meta.map_ptr == NULL) {
- verbose(env,
- "kernel subsystem misconfigured verifier\n");
- return -EINVAL;
+ verifier_bug(env, "unexpected null map_ptr");
+ return -EFAULT;
}
if (func_id == BPF_FUNC_map_lookup_elem &&
@@ -11664,10 +11795,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
} else {
if (fn->ret_btf_id == BPF_PTR_POISON) {
- verbose(env, "verifier internal error:");
- verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
- func_id_name(func_id));
- return -EINVAL;
+ verifier_bug(env, "func %s has non-overwritten BPF_PTR_POISON return type",
+ func_id_name(func_id));
+ return -EFAULT;
}
ret_btf = btf_vmlinux;
ret_btf_id = *fn->ret_btf_id;
@@ -11692,8 +11822,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].id = ++env->id_gen;
if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) {
- verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n",
- func_id_name(func_id), func_id);
+ verifier_bug(env, "func %s#%d sets ref_obj_id more than once",
+ func_id_name(func_id), func_id);
return -EFAULT;
}
@@ -11897,6 +12027,11 @@ static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param
return btf_param_match_suffix(btf, arg, "__irq_flag");
}
+static bool is_kfunc_arg_prog(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__prog");
+}
+
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -11987,6 +12122,16 @@ static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_p
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID);
}
+static bool is_rbtree_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_RB_NODE_ID]);
+}
+
+static bool is_list_node_type(const struct btf_type *t)
+{
+ return t == btf_type_by_id(btf_vmlinux, kf_arg_btf_ids[KF_ARG_LIST_NODE_ID]);
+}
+
static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
const struct btf_param *arg)
{
@@ -12069,6 +12214,8 @@ enum special_kfunc_type {
KF_bpf_list_push_back_impl,
KF_bpf_list_pop_front,
KF_bpf_list_pop_back,
+ KF_bpf_list_front,
+ KF_bpf_list_back,
KF_bpf_cast_to_kern_ctx,
KF_bpf_rdonly_cast,
KF_bpf_rcu_read_lock,
@@ -12076,6 +12223,9 @@ enum special_kfunc_type {
KF_bpf_rbtree_remove,
KF_bpf_rbtree_add_impl,
KF_bpf_rbtree_first,
+ KF_bpf_rbtree_root,
+ KF_bpf_rbtree_left,
+ KF_bpf_rbtree_right,
KF_bpf_dynptr_from_skb,
KF_bpf_dynptr_from_xdp,
KF_bpf_dynptr_slice,
@@ -12101,41 +12251,9 @@ enum special_kfunc_type {
KF_bpf_res_spin_unlock,
KF_bpf_res_spin_lock_irqsave,
KF_bpf_res_spin_unlock_irqrestore,
+ KF___bpf_trap,
};
-BTF_SET_START(special_kfunc_set)
-BTF_ID(func, bpf_obj_new_impl)
-BTF_ID(func, bpf_obj_drop_impl)
-BTF_ID(func, bpf_refcount_acquire_impl)
-BTF_ID(func, bpf_list_push_front_impl)
-BTF_ID(func, bpf_list_push_back_impl)
-BTF_ID(func, bpf_list_pop_front)
-BTF_ID(func, bpf_list_pop_back)
-BTF_ID(func, bpf_cast_to_kern_ctx)
-BTF_ID(func, bpf_rdonly_cast)
-BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add_impl)
-BTF_ID(func, bpf_rbtree_first)
-#ifdef CONFIG_NET
-BTF_ID(func, bpf_dynptr_from_skb)
-BTF_ID(func, bpf_dynptr_from_xdp)
-#endif
-BTF_ID(func, bpf_dynptr_slice)
-BTF_ID(func, bpf_dynptr_slice_rdwr)
-BTF_ID(func, bpf_dynptr_clone)
-BTF_ID(func, bpf_percpu_obj_new_impl)
-BTF_ID(func, bpf_percpu_obj_drop_impl)
-BTF_ID(func, bpf_throw)
-BTF_ID(func, bpf_wq_set_callback_impl)
-#ifdef CONFIG_CGROUPS
-BTF_ID(func, bpf_iter_css_task_new)
-#endif
-#ifdef CONFIG_BPF_LSM
-BTF_ID(func, bpf_set_dentry_xattr)
-BTF_ID(func, bpf_remove_dentry_xattr)
-#endif
-BTF_SET_END(special_kfunc_set)
-
BTF_ID_LIST(special_kfunc_list)
BTF_ID(func, bpf_obj_new_impl)
BTF_ID(func, bpf_obj_drop_impl)
@@ -12144,6 +12262,8 @@ BTF_ID(func, bpf_list_push_front_impl)
BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_front)
+BTF_ID(func, bpf_list_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rcu_read_lock)
@@ -12151,6 +12271,9 @@ BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_rbtree_root)
+BTF_ID(func, bpf_rbtree_left)
+BTF_ID(func, bpf_rbtree_right)
#ifdef CONFIG_NET
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
@@ -12194,6 +12317,7 @@ BTF_ID(func, bpf_res_spin_lock)
BTF_ID(func, bpf_res_spin_unlock)
BTF_ID(func, bpf_res_spin_lock_irqsave)
BTF_ID(func, bpf_res_spin_unlock_irqrestore)
+BTF_ID(func, __bpf_trap)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -12411,7 +12535,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno,
if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
kfunc_class = IRQ_LOCK_KFUNC;
} else {
- verbose(env, "verifier internal error: unknown irq flags kfunc\n");
+ verifier_bug(env, "unknown irq flags kfunc");
return -EFAULT;
}
@@ -12452,12 +12576,12 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
struct btf_record *rec = reg_btf_record(reg);
if (!env->cur_state->active_locks) {
- verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
+ verifier_bug(env, "%s w/o active lock", __func__);
return -EFAULT;
}
if (type_flag(reg->type) & NON_OWN_REF) {
- verbose(env, "verifier internal error: NON_OWN_REF already set\n");
+ verifier_bug(env, "NON_OWN_REF already set");
return -EFAULT;
}
@@ -12476,8 +12600,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o
int i;
if (!ref_obj_id) {
- verbose(env, "verifier internal error: ref_obj_id is zero for "
- "owning -> non-owning conversion\n");
+ verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion");
return -EFAULT;
}
@@ -12497,7 +12620,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o
return 0;
}
- verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
+ verifier_bug(env, "ref state missing for ref_obj_id");
return -EFAULT;
}
@@ -12559,7 +12682,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
ptr = reg->btf;
break;
default:
- verbose(env, "verifier internal error: unknown reg type for lock check\n");
+ verifier_bug(env, "unknown reg type for lock check");
return -EFAULT;
}
id = reg->id;
@@ -12579,14 +12702,19 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
- btf_id == special_kfunc_list[KF_bpf_list_pop_back];
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back] ||
+ btf_id == special_kfunc_list[KF_bpf_list_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_back];
}
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- btf_id == special_kfunc_list[KF_bpf_rbtree_first];
+ btf_id == special_kfunc_list[KF_bpf_rbtree_first] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_root] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_right];
}
static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
@@ -12686,7 +12814,9 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
break;
case BPF_RB_NODE:
ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]);
break;
default:
verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
@@ -12713,7 +12843,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
u32 head_off;
if (meta->btf != btf_vmlinux) {
- verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
+ verifier_bug(env, "unexpected btf mismatch in kfunc call");
return -EFAULT;
}
@@ -12744,7 +12874,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
}
if (*head_field) {
- verbose(env, "verifier internal error: repeating %s arg\n", head_type_name);
+ verifier_bug(env, "repeating %s arg", head_type_name);
return -EFAULT;
}
*head_field = field;
@@ -12781,7 +12911,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
u32 node_off;
if (meta->btf != btf_vmlinux) {
- verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");
+ verifier_bug(env, "unexpected btf mismatch in kfunc call");
return -EFAULT;
}
@@ -12906,6 +13036,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_arg_ignore(btf, &args[i]))
continue;
+ if (is_kfunc_arg_prog(btf, &args[i])) {
+ /* Used to reject repeated use of __prog. */
+ if (meta->arg_prog) {
+ verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc");
+ return -EFAULT;
+ }
+ meta->arg_prog = true;
+ cur_aux(env)->arg_prog = regno;
+ continue;
+ }
+
if (btf_type_is_scalar(t)) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "R%d is not a scalar\n", regno);
@@ -12914,7 +13055,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_arg_constant(meta->btf, &args[i])) {
if (meta->arg_constant.found) {
- verbose(env, "verifier internal error: only one constant argument permitted\n");
+ verifier_bug(env, "only one constant argument permitted");
return -EFAULT;
}
if (!tnum_is_const(reg->var_off)) {
@@ -12966,9 +13107,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (reg->ref_obj_id) {
if (is_kfunc_release(meta) && meta->ref_obj_id) {
- verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
- regno, reg->ref_obj_id,
- meta->ref_obj_id);
+ verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
return -EFAULT;
}
meta->ref_obj_id = reg->ref_obj_id;
@@ -13048,7 +13189,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_RES_SPIN_LOCK:
break;
default:
- WARN_ON_ONCE(1);
+ verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type);
return -EFAULT;
}
@@ -13117,14 +13258,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type;
if (parent_type == BPF_DYNPTR_TYPE_INVALID) {
- verbose(env, "verifier internal error: no dynptr type for parent of clone\n");
+ verifier_bug(env, "no dynptr type for parent of clone");
return -EFAULT;
}
dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type);
clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id;
if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) {
- verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
+ verifier_bug(env, "missing ref obj id for parent of clone");
return -EFAULT;
}
}
@@ -13137,7 +13278,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
int id = dynptr_id(env, reg);
if (id < 0) {
- verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ verifier_bug(env, "failed to obtain dynptr id");
return id;
}
meta->initialized_dynptr.id = id;
@@ -13200,22 +13341,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_RB_NODE:
- if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
- if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
- verbose(env, "rbtree_remove node input must be non-owning ref\n");
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
return -EINVAL;
}
- if (in_rbtree_lock_required_cb(env)) {
- verbose(env, "rbtree_remove not allowed in rbtree cb\n");
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
return -EINVAL;
}
} else {
- if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
- verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) {
+ verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name);
return -EINVAL;
}
- if (!reg->ref_obj_id) {
- verbose(env, "allocated object must be referenced\n");
+ if (in_rbtree_lock_required_cb(env)) {
+ verbose(env, "%s not allowed in rbtree cb\n", func_name);
return -EINVAL;
}
}
@@ -13273,7 +13414,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
if (meta->arg_constant.found) {
- verbose(env, "verifier internal error: only one constant argument permitted\n");
+ verifier_bug(env, "only one constant argument permitted");
return -EFAULT;
}
if (!tnum_is_const(size_reg->var_off)) {
@@ -13305,7 +13446,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
rec = reg_btf_record(reg);
if (!rec) {
- verbose(env, "verifier internal error: Couldn't find btf_record\n");
+ verifier_bug(env, "Couldn't find btf_record");
return -EFAULT;
}
@@ -13420,6 +13561,186 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env,
return 0;
}
+/* check special kfuncs and return:
+ * 1 - not fall-through to 'else' branch, continue verification
+ * 0 - fall-through to 'else' branch
+ * < 0 - not fall-through to 'else' branch, return error
+ */
+static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *regs, struct bpf_insn_aux_data *insn_aux,
+ const struct btf_type *ptr_type, struct btf *desc_btf)
+{
+ const struct btf_type *ret_t;
+ int err = 0;
+
+ if (meta->btf != btf_vmlinux)
+ return 0;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
+ meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ struct btf_struct_meta *struct_meta;
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
+ return -ENOMEM;
+
+ if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta->arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
+ verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
+ ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
+ return -EINVAL;
+ }
+
+ if (!bpf_global_percpu_ma_set) {
+ mutex_lock(&bpf_percpu_ma_lock);
+ if (!bpf_global_percpu_ma_set) {
+ /* Charge memory allocated with bpf_global_percpu_ma to
+ * root memcg. The obj_cgroup for root memcg is NULL.
+ */
+ err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
+ if (!err)
+ bpf_global_percpu_ma_set = true;
+ }
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&bpf_percpu_ma_lock);
+ err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
+ mutex_unlock(&bpf_percpu_ma_lock);
+ if (err)
+ return err;
+ }
+
+ struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+ if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
+ return -EINVAL;
+ }
+
+ if (struct_meta) {
+ verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
+ return -EINVAL;
+ }
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+ if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
+ regs[BPF_REG_0].type |= MEM_PERCPU;
+
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta = struct_meta;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = meta->arg_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_btf_id;
+
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta->arg_btf,
+ meta->arg_btf_id);
+ } else if (is_list_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_list_head.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (is_rbtree_node_type(ptr_type)) {
+ struct btf_field *field = meta->arg_rbtree_root.field;
+
+ mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->ret_btf_id;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value);
+ if (!ret_t) {
+ verbose(env, "Unknown type ID %lld passed to kfunc bpf_rdonly_cast\n",
+ meta->arg_constant.value);
+ return -EINVAL;
+ } else if (btf_type_is_struct(ret_t)) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta->arg_constant.value;
+ } else if (btf_type_is_void(ret_t)) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED;
+ regs[BPF_REG_0].mem_size = 0;
+ } else {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct or void\n");
+ return -EINVAL;
+ }
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta->arg_constant.found) {
+ verifier_bug(env, "bpf_dynptr_slice(_rdwr) no constant size");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta->arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta->initialized_dynptr.id) {
+ verifier_bug(env, "no dynptr id");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
+ } else {
+ return 0;
+ }
+
+ return 1;
+}
+
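/* Editor's note (not part of the patch): a minimal BPF-side sketch of the
 * kfuncs whose return value check_special_kfunc() models above. It assumes
 * vmlinux.h, bpf_helpers.h and the bpf_obj_new()/bpf_obj_drop() wrappers
 * from the selftests' bpf_experimental.h; struct node_data and the section
 * name are made up for illustration.
 */
struct node_data {
	long key;
	long value;
};

SEC("tc")
int obj_new_example(struct __sk_buff *skb)
{
	struct node_data *n;

	/* The return value is tracked as PTR_TO_BTF_ID | MEM_ALLOC and may
	 * be NULL, so the check below is mandatory before any dereference.
	 */
	n = bpf_obj_new(typeof(*n));
	if (!n)
		return 0;

	n->key = skb->len;
	bpf_obj_drop(n);	/* release the acquired local object */
	return 0;
}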
static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name);
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -13434,7 +13755,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn_aux_data *insn_aux;
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
- const struct btf_type *ret_t;
struct btf *desc_btf;
/* skip for now, but return error when we find this in fixup_kfunc_call */
@@ -13476,6 +13796,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return err;
}
__mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32));
+ } else if (!insn->off && insn->imm == special_kfunc_list[KF___bpf_trap]) {
+ verbose(env, "unexpected __bpf_trap() due to uninitialized variable?\n");
+ return -EFAULT;
}
if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
@@ -13654,165 +13977,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
-
- if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
- meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- struct btf_struct_meta *struct_meta;
- struct btf *ret_btf;
- u32 ret_btf_id;
-
- if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
- return -ENOMEM;
-
- if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
- verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
- return -EINVAL;
- }
-
- ret_btf = env->prog->aux->btf;
- ret_btf_id = meta.arg_constant.value;
-
- /* This may be NULL due to user not supplying a BTF */
- if (!ret_btf) {
- verbose(env, "bpf_obj_new/bpf_percpu_obj_new requires prog BTF\n");
- return -EINVAL;
- }
-
- ret_t = btf_type_by_id(ret_btf, ret_btf_id);
- if (!ret_t || !__btf_type_is_struct(ret_t)) {
- verbose(env, "bpf_obj_new/bpf_percpu_obj_new type ID argument must be of a struct\n");
- return -EINVAL;
- }
-
- if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
- verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
- ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
- return -EINVAL;
- }
-
- if (!bpf_global_percpu_ma_set) {
- mutex_lock(&bpf_percpu_ma_lock);
- if (!bpf_global_percpu_ma_set) {
- /* Charge memory allocated with bpf_global_percpu_ma to
- * root memcg. The obj_cgroup for root memcg is NULL.
- */
- err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
- if (!err)
- bpf_global_percpu_ma_set = true;
- }
- mutex_unlock(&bpf_percpu_ma_lock);
- if (err)
- return err;
- }
-
- mutex_lock(&bpf_percpu_ma_lock);
- err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
- mutex_unlock(&bpf_percpu_ma_lock);
- if (err)
- return err;
- }
-
- struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
- if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
- if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
- verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n");
- return -EINVAL;
- }
-
- if (struct_meta) {
- verbose(env, "bpf_percpu_obj_new type ID argument must not contain special fields\n");
- return -EINVAL;
- }
- }
-
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
- regs[BPF_REG_0].btf = ret_btf;
- regs[BPF_REG_0].btf_id = ret_btf_id;
- if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl])
- regs[BPF_REG_0].type |= MEM_PERCPU;
-
- insn_aux->obj_new_size = ret_t->size;
- insn_aux->kptr_struct_meta = struct_meta;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
- regs[BPF_REG_0].btf = meta.arg_btf;
- regs[BPF_REG_0].btf_id = meta.arg_btf_id;
-
- insn_aux->kptr_struct_meta =
- btf_find_struct_meta(meta.arg_btf,
- meta.arg_btf_id);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
- meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
- struct btf_field *field = meta.arg_list_head.field;
-
- mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
- struct btf_field *field = meta.arg_rbtree_root.field;
-
- mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].btf_id = meta.ret_btf_id;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
- ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
- if (!ret_t || !btf_type_is_struct(ret_t)) {
- verbose(env,
- "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
- return -EINVAL;
- }
-
- mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
- regs[BPF_REG_0].btf = desc_btf;
- regs[BPF_REG_0].btf_id = meta.arg_constant.value;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
- meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
- enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
-
- mark_reg_known_zero(env, regs, BPF_REG_0);
-
- if (!meta.arg_constant.found) {
- verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
- return -EFAULT;
- }
-
- regs[BPF_REG_0].mem_size = meta.arg_constant.value;
-
- /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
- regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
-
- if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
- regs[BPF_REG_0].type |= MEM_RDONLY;
- } else {
- /* this will set env->seen_direct_write to true */
- if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
- verbose(env, "the prog does not allow writes to packet data\n");
- return -EINVAL;
- }
- }
-
- if (!meta.initialized_dynptr.id) {
- verbose(env, "verifier internal error: no dynptr id\n");
- return -EFAULT;
- }
- regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
-
- /* we don't need to set BPF_REG_0's ref obj id
- * because packet slices are not refcounted (see
- * dynptr_type_refcounted)
- */
- } else {
- verbose(env, "kernel function %s unhandled dynamic return type\n",
- meta.func_name);
- return -EFAULT;
- }
+ err = check_special_kfunc(env, &meta, regs, insn_aux, ptr_type, desc_btf);
+ if (err) {
+ if (err < 0)
+ return err;
} else if (btf_type_is_void(ptr_type)) {
/* kfunc returning 'void *' is equivalent to returning scalar */
mark_reg_unknown(env, regs, BPF_REG_0);
@@ -13881,14 +14049,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (is_kfunc_ret_null(&meta))
regs[BPF_REG_0].id = id;
regs[BPF_REG_0].ref_obj_id = id;
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
+ } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) {
ref_set_non_owning(env, &regs[BPF_REG_0]);
}
if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
regs[BPF_REG_0].id = ++env->id_gen;
} else if (btf_type_is_void(t)) {
- if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.btf == btf_vmlinux) {
if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) {
insn_aux->kptr_struct_meta =
@@ -13997,7 +14165,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
const struct bpf_insn *insn)
{
- return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K;
+ return env->bypass_spec_v1 ||
+ BPF_SRC(insn->code) == BPF_K ||
+ cur_aux(env)->nospec;
}
static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
@@ -14197,10 +14367,9 @@ static int sanitize_err(struct bpf_verifier_env *env,
case REASON_STACK:
verbose(env, "R%d could not be pushed for speculative verification, %s\n",
dst, err);
- break;
+ return -ENOMEM;
default:
- verbose(env, "verifier internal error: unknown reason (%d)\n",
- reason);
+ verifier_bug(env, "unknown reason (%d)", reason);
break;
}
@@ -14267,7 +14436,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env,
}
break;
default:
- break;
+ return -EOPNOTSUPP;
}
return 0;
@@ -14294,7 +14463,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
struct bpf_sanitize_info info = {};
u8 opcode = BPF_OP(insn->code);
u32 dst = insn->dst_reg;
- int ret;
+ int ret, bounds_ret;
dst_reg = &regs[dst];
@@ -14326,6 +14495,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
}
+ /*
+ * Accesses to untrusted PTR_TO_MEM are done through probe
+ * instructions, hence no need to track offsets.
+ */
+ if (base_type(ptr_reg->type) == PTR_TO_MEM && (ptr_reg->type & PTR_UNTRUSTED))
+ return 0;
+
switch (base_type(ptr_reg->type)) {
case PTR_TO_CTX:
case PTR_TO_MAP_VALUE:
@@ -14494,11 +14670,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
return -EINVAL;
reg_bounds_sync(dst_reg);
- if (sanitize_check_bounds(env, insn, dst_reg) < 0)
- return -EACCES;
+ bounds_ret = sanitize_check_bounds(env, insn, dst_reg);
+ if (bounds_ret == -EACCES)
+ return bounds_ret;
if (sanitize_needed(opcode)) {
ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg,
&info, true);
+ if (verifier_bug_if(!can_skip_alu_sanitation(env, insn)
+ && !env->cur_state->speculative
+ && bounds_ret
+ && !ret,
+ env, "Pointer type unsupported by sanitize_check_bounds() not rejected by retrieve_ptr_limit() as required")) {
+ return -EFAULT;
+ }
if (ret < 0)
return sanitize_err(env, insn, ret, off_reg, dst_reg);
}
@@ -14513,14 +14697,25 @@ static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
s32 *dst_smax = &dst_reg->s32_max_value;
u32 *dst_umin = &dst_reg->u32_min_value;
u32 *dst_umax = &dst_reg->u32_max_value;
+ u32 umin_val = src_reg->u32_min_value;
+ u32 umax_val = src_reg->u32_max_value;
+ bool min_overflow, max_overflow;
if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) ||
check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) {
*dst_smin = S32_MIN;
*dst_smax = S32_MAX;
}
- if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) ||
- check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) {
+
+ /* If either all additions overflow or no additions overflow, then
+ * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
+ * dst_umax + src_umax. Otherwise (some additions overflow), set
+ * the output bounds to unbounded.
+ */
+ min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
+ max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
+
+ if (!min_overflow && max_overflow) {
*dst_umin = 0;
*dst_umax = U32_MAX;
}
@@ -14533,14 +14728,25 @@ static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
s64 *dst_smax = &dst_reg->smax_value;
u64 *dst_umin = &dst_reg->umin_value;
u64 *dst_umax = &dst_reg->umax_value;
+ u64 umin_val = src_reg->umin_value;
+ u64 umax_val = src_reg->umax_value;
+ bool min_overflow, max_overflow;
if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) ||
check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) {
*dst_smin = S64_MIN;
*dst_smax = S64_MAX;
}
- if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) ||
- check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) {
+
+ /* If either all additions overflow or no additions overflow, then
+ * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax =
+ * dst_umax + src_umax. Otherwise (some additions overflow), set
+ * the output bounds to unbounded.
+ */
+ min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin);
+ max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax);
+
+ if (!min_overflow && max_overflow) {
*dst_umin = 0;
*dst_umax = U64_MAX;
}
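/* Editor's note (not part of the patch): a standalone user-space sketch of
 * the interval rule above, using the compiler builtin that
 * check_add_overflow() wraps. With dst in [10, U64_MAX - 1] and src in
 * [2, 5] only the max endpoint wraps, so no single wrapped interval covers
 * all possible sums and the bounds must widen to [0, U64_MAX]; if both or
 * neither endpoint wraps, the wrapped endpoints remain a valid bound.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dst_umin = 10, dst_umax = UINT64_MAX - 1;
	uint64_t src_umin = 2, src_umax = 5;
	bool min_overflow, max_overflow;

	min_overflow = __builtin_add_overflow(dst_umin, src_umin, &dst_umin);
	max_overflow = __builtin_add_overflow(dst_umax, src_umax, &dst_umax);

	if (!min_overflow && max_overflow) {	/* partial overflow: unknown */
		dst_umin = 0;
		dst_umax = UINT64_MAX;
	}
	printf("[%llu, %llu]\n", (unsigned long long)dst_umin,
	       (unsigned long long)dst_umax);	/* prints [0, 18446744073709551615] */
	return 0;
}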
@@ -14551,8 +14757,11 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
{
s32 *dst_smin = &dst_reg->s32_min_value;
s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
u32 umin_val = src_reg->u32_min_value;
u32 umax_val = src_reg->u32_max_value;
+ bool min_underflow, max_underflow;
if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) ||
check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {
@@ -14560,14 +14769,18 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
*dst_smin = S32_MIN;
*dst_smax = S32_MAX;
}
- if (dst_reg->u32_min_value < umax_val) {
- /* Overflow possible, we know nothing */
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
- } else {
- /* Cannot overflow (as long as bounds are consistent) */
- dst_reg->u32_min_value -= umax_val;
- dst_reg->u32_max_value -= umin_val;
+
+ /* If either all subtractions underflow or no subtractions
+ * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
+ * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
+ * underflow), set the output bounds to unbounded.
+ */
+ min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
+ max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
+
+ if (min_underflow && !max_underflow) {
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
}
@@ -14576,8 +14789,11 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
{
s64 *dst_smin = &dst_reg->smin_value;
s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
u64 umin_val = src_reg->umin_value;
u64 umax_val = src_reg->umax_value;
+ bool min_underflow, max_underflow;
if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) ||
check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {
@@ -14585,14 +14801,18 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
*dst_smin = S64_MIN;
*dst_smax = S64_MAX;
}
- if (dst_reg->umin_value < umax_val) {
- /* Overflow possible, we know nothing */
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- } else {
- /* Cannot overflow (as long as bounds are consistent) */
- dst_reg->umin_value -= umax_val;
- dst_reg->umax_value -= umin_val;
+
+ /* If either all subtractions underflow or no subtractions
+ * underflow, it is okay to set: dst_umin = dst_umin - src_umax,
+ * dst_umax = dst_umax - src_umin. Otherwise (some subtractions
+ * underflow), set the output bounds to unbounded.
+ */
+ min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin);
+ max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax);
+
+ if (min_underflow && !max_underflow) {
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
}
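/* Editor's note (not part of the patch): why min_underflow && !max_underflow
 * is the only mixed case that needs handling above. The candidate endpoints
 * are
 *
 *	new_umin = dst_umin - src_umax,  new_umax = dst_umax - src_umin.
 *
 * Computed over the integers (before wrapping), dst_umin <= dst_umax and
 * src_umin <= src_umax give new_umin <= new_umax. An endpoint "underflows"
 * exactly when its integer value is negative, so if new_umax underflows then
 * new_umin must underflow as well. Hence "max underflows but min does not"
 * cannot occur, and when both or neither underflow the wrapped endpoints
 * still bound every possible dst - src.
 */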
@@ -15054,6 +15274,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
switch (BPF_OP(insn->code)) {
case BPF_ADD:
case BPF_SUB:
+ case BPF_NEG:
case BPF_AND:
case BPF_XOR:
case BPF_OR:
@@ -15122,6 +15343,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
scalar_min_max_sub(dst_reg, &src_reg);
dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
break;
+ case BPF_NEG:
+ env->fake_reg[0] = *dst_reg;
+ __mark_reg_known(dst_reg, 0);
+ scalar32_min_max_sub(dst_reg, &env->fake_reg[0]);
+ scalar_min_max_sub(dst_reg, &env->fake_reg[0]);
+ dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off);
+ break;
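/* Editor's note (not part of the patch): BPF_NEG is modeled above as the
 * subtraction 0 - dst. The old dst range is stashed in env->fake_reg[0],
 * dst is marked as the known constant 0, and the existing scalar(32)
 * subtraction logic derives the negated range, with the tnum taken from
 * tnum_neg() of the saved value. Worked example: for x in [1, 5] the signed
 * bounds become [0 - 5, 0 - 1] = [-5, -1]; both unsigned endpoint
 * subtractions wrap, so per the rule in scalar_min_max_sub() the wrapped
 * values U64_MAX - 4 .. U64_MAX are kept, which is the same set of values
 * reinterpreted as unsigned.
 */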
case BPF_MUL:
dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
scalar32_min_max_mul(dst_reg, &src_reg);
@@ -15261,12 +15489,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
if (WARN_ON_ONCE(ptr_reg)) {
print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
- return -EINVAL;
+ return -EFAULT;
}
if (WARN_ON(!src_reg)) {
print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: no src_reg\n");
- return -EINVAL;
+ return -EFAULT;
}
err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
if (err)
@@ -15345,7 +15573,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check dest operand */
- err = check_reg_arg(env, insn->dst_reg, DST_OP);
+ if (opcode == BPF_NEG) {
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ err = err ?: adjust_scalar_min_max_vals(env, insn,
+ &regs[insn->dst_reg],
+ regs[insn->dst_reg]);
+ } else {
+ err = check_reg_arg(env, insn->dst_reg, DST_OP);
+ }
if (err)
return err;
@@ -16012,6 +16247,10 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state
if (!is_reg_const(reg2, is_jmp32))
break;
val = reg_const_value(reg2, is_jmp32);
+ /* Forget the ranges before narrowing tnums, to avoid invariant
+ * violations if we're on a dead branch.
+ */
+ __mark_reg_unbounded(reg1);
if (is_jmp32) {
t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val));
reg1->var_off = tnum_with_subreg(reg1->var_off, t);
@@ -16377,6 +16616,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_reg_state *eq_branch_regs;
struct linked_regs linked_regs = {};
u8 opcode = BPF_OP(insn->code);
+ int insn_flags = 0;
bool is_jmp32;
int pred = -1;
int err;
@@ -16435,6 +16675,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
insn->src_reg);
return -EACCES;
}
+
+ if (src_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_SRC_REG_STACK;
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -16444,6 +16689,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
memset(src_reg, 0, sizeof(*src_reg));
src_reg->type = SCALAR_VALUE;
__mark_reg_known(src_reg, insn->imm);
+
+ if (dst_reg->type == PTR_TO_STACK)
+ insn_flags |= INSN_F_DST_REG_STACK;
+ }
+
+ if (insn_flags) {
+ err = push_jmp_history(env, this_branch, insn_flags, 0);
+ if (err)
+ return err;
}
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
@@ -16499,7 +16753,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
if (linked_regs.cnt > 1) {
- err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+ err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
if (err)
return err;
}
@@ -16654,7 +16908,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
dst_reg->btf_id = aux->btf_var.btf_id;
break;
default:
- verbose(env, "bpf verifier is misconfigured\n");
+ verifier_bug(env, "pseudo btf id: unexpected dst reg type");
return -EFAULT;
}
return 0;
@@ -16696,8 +16950,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
} else {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "unexpected src reg value for ldimm64");
+ return -EFAULT;
}
return 0;
@@ -16743,8 +16997,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
if (!env->ops->gen_ld_abs) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "gen_ld_abs is null");
+ return -EFAULT;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
@@ -17154,7 +17408,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
} else {
- verbose(env, "insn state internal bug\n");
+ verifier_bug(env, "insn state internal bug");
return -EFAULT;
}
return DONE_EXPLORING;
@@ -17574,17 +17828,18 @@ static int check_cfg(struct bpf_verifier_env *env)
int *insn_stack, *insn_state, *insn_postorder;
int ex_insn_beg, i, ret = 0;
- insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
if (!insn_state)
return -ENOMEM;
- insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
if (!insn_stack) {
kvfree(insn_state);
return -ENOMEM;
}
- insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_postorder = env->cfg.insn_postorder =
+ kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
if (!insn_postorder) {
kvfree(insn_state);
kvfree(insn_stack);
@@ -17614,7 +17869,7 @@ walk_cfg:
break;
default:
if (ret > 0) {
- verbose(env, "visit_insn internal bug\n");
+ verifier_bug(env, "visit_insn internal bug");
ret = -EFAULT;
}
goto err_free;
@@ -17622,7 +17877,7 @@ walk_cfg:
}
if (env->cfg.cur_stack < 0) {
- verbose(env, "pop stack internal bug\n");
+ verifier_bug(env, "pop stack internal bug");
ret = -EFAULT;
goto err_free;
}
@@ -17718,7 +17973,7 @@ static int check_btf_func_early(struct bpf_verifier_env *env,
urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
min_size = min_t(u32, krec_size, urec_size);
- krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN);
+ krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!krecord)
return -ENOMEM;
@@ -17818,7 +18073,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
urecord = make_bpfptr(attr->func_info, uattr.is_kernel);
krecord = prog->aux->func_info;
- info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN);
+ info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!info_aux)
return -ENOMEM;
@@ -17904,7 +18159,7 @@ static int check_btf_line(struct bpf_verifier_env *env,
* pass in a smaller bpf_line_info object.
*/
linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info),
- GFP_KERNEL | __GFP_NOWARN);
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!linfo)
return -ENOMEM;
@@ -18227,10 +18482,6 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
{
int i;
- if (st->frame[0]->regs[0].live & REG_LIVE_DONE)
- /* all regs in this state in all frames were already marked */
- return;
-
for (i = 0; i <= st->curframe; i++)
clean_func_state(env, st->frame[i]);
}
@@ -18238,7 +18489,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
/* the parentage chains form a tree.
* the verifier states are added to state lists at given insn and
* pushed into state stack for future exploration.
- * when the verifier reaches bpf_exit insn some of the verifer states
+ * when the verifier reaches bpf_exit insn some of the verifier states
* stored in the state lists have their final liveness state already,
* but a lot of states will get revised from liveness point of view when
* the verifier explores other branches.
@@ -18270,7 +18521,6 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
- struct bpf_verifier_state *loop_entry;
struct bpf_verifier_state_list *sl;
struct list_head *pos, *head;
@@ -18279,12 +18529,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
sl = container_of(pos, struct bpf_verifier_state_list, node);
if (sl->state.branches)
continue;
- loop_entry = get_loop_entry(env, &sl->state);
- if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches)
- continue;
if (sl->state.insn_idx != insn ||
!same_callsites(&sl->state, cur))
continue;
+ if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE)
+ /* all regs in this state in all frames were already marked */
+ continue;
+ if (incomplete_read_marks(env, &sl->state))
+ continue;
clean_verifier_state(env, &sl->state);
}
}
@@ -18731,9 +18983,7 @@ static bool states_equal(struct bpf_verifier_env *env,
* and all frame states need to be equivalent
*/
for (i = 0; i <= old->curframe; i++) {
- insn_idx = i == old->curframe
- ? env->insn_idx
- : old->frame[i + 1]->callsite;
+ insn_idx = frame_insn_idx(old, i);
if (old->frame[i]->callsite != cur->frame[i]->callsite)
return false;
if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
@@ -18780,12 +19030,15 @@ static int propagate_liveness_reg(struct bpf_verifier_env *env,
*/
static int propagate_liveness(struct bpf_verifier_env *env,
const struct bpf_verifier_state *vstate,
- struct bpf_verifier_state *vparent)
+ struct bpf_verifier_state *vparent,
+ bool *changed)
{
struct bpf_reg_state *state_reg, *parent_reg;
struct bpf_func_state *state, *parent;
int i, frame, err = 0;
+ bool tmp = false;
+ changed = changed ?: &tmp;
if (vparent->curframe != vstate->curframe) {
WARN(1, "propagate_live: parent frame %d current frame %d\n",
vparent->curframe, vstate->curframe);
@@ -18804,6 +19057,7 @@ static int propagate_liveness(struct bpf_verifier_env *env,
&parent_reg[i]);
if (err < 0)
return err;
+ *changed |= err > 0;
if (err == REG_LIVE_READ64)
mark_insn_zext(env, &parent_reg[i]);
}
@@ -18815,6 +19069,7 @@ static int propagate_liveness(struct bpf_verifier_env *env,
state_reg = &state->stack[i].spilled_ptr;
err = propagate_liveness_reg(env, state_reg,
parent_reg);
+ *changed |= err > 0;
if (err < 0)
return err;
}
@@ -18826,7 +19081,9 @@ static int propagate_liveness(struct bpf_verifier_env *env,
* propagate them into the current state
*/
static int propagate_precision(struct bpf_verifier_env *env,
- const struct bpf_verifier_state *old)
+ const struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ bool *changed)
{
struct bpf_reg_state *state_reg;
struct bpf_func_state *state;
@@ -18874,13 +19131,53 @@ static int propagate_precision(struct bpf_verifier_env *env,
verbose(env, "\n");
}
- err = mark_chain_precision_batch(env);
+ err = __mark_chain_precision(env, cur, -1, changed);
if (err < 0)
return err;
return 0;
}
+#define MAX_BACKEDGE_ITERS 64
+
+/* Propagate read and precision marks from visit->backedges[*].state->equal_state
+ * to corresponding parent states of visit->backedges[*].state until fixed point is reached,
+ * then free visit->backedges.
+ * After execution of this function incomplete_read_marks() will return false
+ * for all states corresponding to @visit->callchain.
+ */
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit)
+{
+ struct bpf_scc_backedge *backedge;
+ struct bpf_verifier_state *st;
+ bool changed;
+ int i, err;
+
+ i = 0;
+ do {
+ if (i++ > MAX_BACKEDGE_ITERS) {
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "%s: too many iterations\n", __func__);
+ for (backedge = visit->backedges; backedge; backedge = backedge->next)
+ mark_all_scalars_precise(env, &backedge->state);
+ break;
+ }
+ changed = false;
+ for (backedge = visit->backedges; backedge; backedge = backedge->next) {
+ st = &backedge->state;
+ err = propagate_liveness(env, st->equal_state, st, &changed);
+ if (err)
+ return err;
+ err = propagate_precision(env, st->equal_state, st, &changed);
+ if (err)
+ return err;
+ }
+ } while (changed);
+
+ free_backedges(visit);
+ return 0;
+}
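/* Editor's note (not part of the patch): a generic, standalone sketch of the
 * pattern propagate_backedges() follows: repeat a monotone propagation step
 * until it reports no change, and if the iteration cap is hit, fall back to
 * the most conservative result so soundness is kept at the cost of
 * precision. All names below are made up for illustration.
 */
#include <stdbool.h>

#define MAX_ITERS 64

struct node { unsigned int marks, incoming; };

/* one monotone pass: returns true if any mark grew */
static bool propagate_once(struct node *nodes, int n)
{
	bool changed = false;

	for (int i = 0; i < n; i++) {
		unsigned int want = nodes[i].marks | nodes[i].incoming;

		if (want != nodes[i].marks) {
			nodes[i].marks = want;
			changed = true;
		}
	}
	return changed;
}

static void propagate_to_fixed_point(struct node *nodes, int n)
{
	for (int iter = 0; ; iter++) {
		if (iter > MAX_ITERS) {
			for (int i = 0; i < n; i++)
				nodes[i].marks = ~0u;	/* conservative fallback */
			return;
		}
		if (!propagate_once(nodes, n))
			return;
	}
}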
+
static bool states_maybe_looping(struct bpf_verifier_state *old,
struct bpf_verifier_state *cur)
{
@@ -18908,7 +19205,7 @@ static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
* terminology) calls specially: as opposed to bounded BPF loops, it *expects*
* states to match, which otherwise would look like an infinite loop. So while
* iter_next() calls are taken care of, we still need to be careful and
- * prevent erroneous and too eager declaration of "ininite loop", when
+ * prevent erroneous and too eager declaration of "infinite loop", when
* iterators are involved.
*
* Here's a situation in pseudo-BPF assembly form:
@@ -18950,7 +19247,7 @@ static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
*
* This approach allows to keep infinite loop heuristic even in the face of
* active iterator. E.g., C snippet below is and will be detected as
- * inifintely looping:
+ * infinitely looping:
*
* struct bpf_iter_num it;
* int *p, x;
@@ -18990,14 +19287,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
- struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
+ struct bpf_verifier_state *cur = env->cur_state, *new;
+ bool force_new_state, add_new_state, loop;
int i, j, n, err, states_cnt = 0;
- bool force_new_state, add_new_state, force_exact;
struct list_head *pos, *tmp, *head;
force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
/* Avoid accumulating infinitely long jmp history */
- cur->insn_hist_end - cur->insn_hist_start > 40;
+ cur->jmp_history_cnt > 40;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -19014,6 +19311,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
clean_live_states(env, insn_idx, cur);
+ loop = false;
head = explored_state(env, insn_idx);
list_for_each_safe(pos, tmp, head) {
sl = container_of(pos, struct bpf_verifier_state_list, node);
@@ -19093,7 +19391,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
- update_loop_entry(env, cur, &sl->state);
+ loop = true;
goto hit;
}
}
@@ -19102,7 +19400,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
if (is_may_goto_insn_at(env, insn_idx)) {
if (sl->state.may_goto_depth != cur->may_goto_depth &&
states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
- update_loop_entry(env, cur, &sl->state);
+ loop = true;
goto hit;
}
}
@@ -19144,38 +19442,9 @@ skip_inf_loop_check:
add_new_state = false;
goto miss;
}
- /* If sl->state is a part of a loop and this loop's entry is a part of
- * current verification path then states have to be compared exactly.
- * 'force_exact' is needed to catch the following case:
- *
- * initial Here state 'succ' was processed first,
- * | it was eventually tracked to produce a
- * V state identical to 'hdr'.
- * .---------> hdr All branches from 'succ' had been explored
- * | | and thus 'succ' has its .branches == 0.
- * | V
- * | .------... Suppose states 'cur' and 'succ' correspond
- * | | | to the same instruction + callsites.
- * | V V In such case it is necessary to check
- * | ... ... if 'succ' and 'cur' are states_equal().
- * | | | If 'succ' and 'cur' are a part of the
- * | V V same loop exact flag has to be set.
- * | succ <- cur To check if that is the case, verify
- * | | if loop entry of 'succ' is in current
- * | V DFS path.
- * | ...
- * | |
- * '----'
- *
- * Additional details are in the comment before get_loop_entry().
- */
- loop_entry = get_loop_entry(env, &sl->state);
- if (IS_ERR(loop_entry))
- return PTR_ERR(loop_entry);
- force_exact = loop_entry && loop_entry->branches > 0;
- if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) {
- if (force_exact)
- update_loop_entry(env, cur, loop_entry);
+ /* See comments for mark_all_regs_read_and_precise() */
+ loop = incomplete_read_marks(env, &sl->state);
+ if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
@@ -19188,7 +19457,7 @@ hit:
* they'll be immediately forgotten as we're pruning
* this state and will pop a new one.
*/
- err = propagate_liveness(env, &sl->state, cur);
+ err = propagate_liveness(env, &sl->state, cur, NULL);
/* if previous state reached the exit with precision and
* current state is equivalent to it (except precision marks)
@@ -19196,10 +19465,98 @@ hit:
* the current state.
*/
if (is_jmp_point(env, env->insn_idx))
- err = err ? : push_insn_history(env, cur, 0, 0);
- err = err ? : propagate_precision(env, &sl->state);
+ err = err ? : push_jmp_history(env, cur, 0, 0);
+ err = err ? : propagate_precision(env, &sl->state, cur, NULL);
if (err)
return err;
+ /* When processing iterator based loops above propagate_liveness and
+ * propagate_precision calls are not sufficient to transfer all relevant
+ * read and precision marks. E.g. consider the following case:
+ *
+ * .-> A --. Assume the states are visited in the order A, B, C.
+ * | | | Assume that state B reaches a state equivalent to state A.
+ * | v v At this point, state C is not processed yet, so state A
+ * '-- B C has not received any read or precision marks from C.
+ * Thus, marks propagated from A to B are incomplete.
+ *
+ * The verifier mitigates this by performing the following steps:
+ *
+ * - Prior to the main verification pass, strongly connected components
+ * (SCCs) are computed over the program's control flow graph,
+ * intraprocedurally.
+ *
+ * - During the main verification pass, `maybe_enter_scc()` checks
+ * whether the current verifier state is entering an SCC. If so, an
+ * instance of a `bpf_scc_visit` object is created, and the state
+ * entering the SCC is recorded as the entry state.
+ *
+ * - This instance is associated not with the SCC itself, but with a
+ * `bpf_scc_callchain`: a tuple consisting of the call sites leading to
+ * the SCC and the SCC id. See `compute_scc_callchain()`.
+ *
+ * - When a verification path encounters a `states_equal(...,
+ * RANGE_WITHIN)` condition, there exists a call chain describing the
+ * current state and a corresponding `bpf_scc_visit` instance. A copy
+ * of the current state is created and added to
+ * `bpf_scc_visit->backedges`.
+ *
+ * - When a verification path terminates, `maybe_exit_scc()` is called
+ * from `update_branch_counts()`. For states with `branches == 0`, it
+ * checks whether the state is the entry state of any `bpf_scc_visit`
+ * instance. If it is, this indicates that all paths originating from
+ * this SCC visit have been explored. `propagate_backedges()` is then
+ * called, which propagates read and precision marks through the
+ * backedges until a fixed point is reached.
+ * (In the earlier example, this would propagate marks from A to B,
+ * from C to A, and then again from A to B.)
+ *
+ * A note on callchains
+ * --------------------
+ *
+ * Consider the following example:
+ *
+ * void foo() { loop { ... SCC#1 ... } }
+ * void main() {
+ * A: foo();
+ * B: ...
+ * C: foo();
+ * }
+ *
+ * Here, there are two distinct callchains leading to SCC#1:
+ * - (A, SCC#1)
+ * - (C, SCC#1)
+ *
+ * Each callchain identifies a separate `bpf_scc_visit` instance that
+ * accumulates backedge states. The `propagate_{liveness,precision}()`
+ * functions traverse the parent state of each backedge state, which
+ * means these parent states must remain valid (i.e., not freed) while
+ * the corresponding `bpf_scc_visit` instance exists.
+ *
+ * Associating `bpf_scc_visit` instances directly with SCCs instead of
+ * callchains would break this invariant:
+ * - States explored during `C: foo()` would contribute backedges to
+ * SCC#1, but SCC#1 would only be exited once the exploration of
+ * `A: foo()` completes.
+ * - By that time, the states explored between `A: foo()` and `C: foo()`
+ * (i.e., `B: ...`) may have already been freed, causing the parent
+ * links for states from `C: foo()` to become invalid.
+ */
+ if (loop) {
+ struct bpf_scc_backedge *backedge;
+
+ backedge = kzalloc(sizeof(*backedge), GFP_KERNEL_ACCOUNT);
+ if (!backedge)
+ return -ENOMEM;
+ err = copy_verifier_state(&backedge->state, cur);
+ backedge->state.equal_state = &sl->state;
+ backedge->state.insn_idx = insn_idx;
+ err = err ?: add_scc_backedge(env, &sl->state, backedge);
+ if (err) {
+ free_verifier_state(&backedge->state, false);
+ kvfree(backedge);
+ return err;
+ }
+ }
return 1;
}
miss:
@@ -19251,7 +19608,7 @@ miss:
* When looping the sl->state.branches will be > 0 and this state
* will not be considered for equivalence until branches == 0.
*/
- new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
+ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL_ACCOUNT);
if (!new_sl)
return -ENOMEM;
env->total_states++;
@@ -19273,13 +19630,20 @@ miss:
return err;
}
new->insn_idx = insn_idx;
- WARN_ONCE(new->branches != 1,
- "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
+ verifier_bug_if(new->branches != 1, env,
+ "%s:branches_to_explore=%d insn %d",
+ __func__, new->branches, insn_idx);
+ err = maybe_enter_scc(env, new);
+ if (err) {
+ free_verifier_state(new, false);
+ kvfree(new_sl);
+ return err;
+ }
cur->parent = new;
cur->first_insn_idx = insn_idx;
- cur->insn_hist_start = cur->insn_hist_end;
cur->dfs_depth = new->dfs_depth + 1;
+ clear_jmp_history(cur);
list_add(&new_sl->node, head);
/* connect new state to parentage chain. Current frame needs all
@@ -19351,10 +19715,27 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
!reg_type_mismatch_ok(prev));
}
+static bool is_ptr_to_mem_or_btf_id(enum bpf_reg_type type)
+{
+ switch (base_type(type)) {
+ case PTR_TO_MEM:
+ case PTR_TO_BTF_ID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool is_ptr_to_mem(enum bpf_reg_type type)
+{
+ return base_type(type) == PTR_TO_MEM;
+}
+
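/* Editor's note (not part of the patch): an illustration of the merge these
 * helpers enable in save_aux_ptr_type() below. If one verification path
 * reaches a load with PTR_TO_BTF_ID | PTR_TRUSTED and another path reaches
 * the same insn with PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED, the recorded
 * type becomes PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED: the PTR_TO_MEM base
 * wins and the PTR_UNTRUSTED/MEM_RDONLY flags are OR-ed in, so the access is
 * emitted as a BPF_PROBE_MEM* load that is safe on either path.
 */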
static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
bool allow_trust_mismatch)
{
enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
+ enum bpf_reg_type merged_type;
if (*prev_type == NOT_INIT) {
/* Saw a valid insn
@@ -19371,15 +19752,24 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ
* Reject it.
*/
if (allow_trust_mismatch &&
- base_type(type) == PTR_TO_BTF_ID &&
- base_type(*prev_type) == PTR_TO_BTF_ID) {
+ is_ptr_to_mem_or_btf_id(type) &&
+ is_ptr_to_mem_or_btf_id(*prev_type)) {
/*
* Have to support a use case when one path through
* the program yields TRUSTED pointer while another
* is UNTRUSTED. Fallback to UNTRUSTED to generate
* BPF_PROBE_MEM/BPF_PROBE_MEMSX.
+ * Same behavior of MEM_RDONLY flag.
*/
- *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ if (is_ptr_to_mem(type) || is_ptr_to_mem(*prev_type))
+ merged_type = PTR_TO_MEM;
+ else
+ merged_type = PTR_TO_BTF_ID;
+ if ((type & PTR_UNTRUSTED) || (*prev_type & PTR_UNTRUSTED))
+ merged_type |= PTR_UNTRUSTED;
+ if ((type & MEM_RDONLY) || (*prev_type & MEM_RDONLY))
+ merged_type |= MEM_RDONLY;
+ *prev_type = merged_type;
} else {
verbose(env, "same insn cannot be used with different pointers\n");
return -EINVAL;
@@ -19389,20 +19779,224 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ
return 0;
}
+enum {
+ PROCESS_BPF_EXIT = 1
+};
+
+static int process_bpf_exit_full(struct bpf_verifier_env *env,
+ bool *do_print_state,
+ bool exception_exit)
+{
+ /* We must do check_reference_leak here before
+ * prepare_func_exit to handle the case when
+ * state->curframe > 0, it may be a callback function,
+ * for which reference_state must match caller reference
+ * state when it exits.
+ */
+ int err = check_resource_leak(env, exception_exit,
+ !env->cur_state->curframe,
+ "BPF_EXIT instruction in main prog");
+ if (err)
+ return err;
+
+ /* The side effect of the prepare_func_exit which is
+ * being skipped is that it frees bpf_func_state.
+ * Typically, process_bpf_exit will only be hit with
+ * outermost exit. copy_verifier_state in pop_stack will
+ * handle freeing of any extra bpf_func_state left over
+ * from not processing all nested function exits. We
+ * also skip return code checks as they are not needed
+ * for exceptional exits.
+ */
+ if (exception_exit)
+ return PROCESS_BPF_EXIT;
+
+ if (env->cur_state->curframe) {
+ /* exit from nested function */
+ err = prepare_func_exit(env, &env->insn_idx);
+ if (err)
+ return err;
+ *do_print_state = true;
+ return 0;
+ }
+
+ err = check_return_code(env, BPF_REG_0, "R0");
+ if (err)
+ return err;
+ return PROCESS_BPF_EXIT;
+}
+
+static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state)
+{
+ int err;
+ struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx];
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ err = check_alu_op(env, insn);
+ if (err)
+ return err;
+
+ } else if (class == BPF_LDX) {
+ bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX;
+
+ /* Check for reserved fields is already done in
+ * resolve_pseudo_ldimm64().
+ */
+ err = check_load_mem(env, insn, false, is_ldsx, true, "ldx");
+ if (err)
+ return err;
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_ATOMIC) {
+ err = check_atomic(env, insn);
+ if (err)
+ return err;
+ env->insn_idx++;
+ return 0;
+ }
+
+ if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
+ verbose(env, "BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_store_reg(env, insn, false);
+ if (err)
+ return err;
+ } else if (class == BPF_ST) {
+ enum bpf_reg_type dst_reg_type;
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose(env, "BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg_type = cur_regs(env)[insn->dst_reg].type;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
+ u8 opcode = BPF_OP(insn->code);
+
+ env->jmps_processed++;
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ (insn->src_reg != BPF_PSEUDO_KFUNC_CALL &&
+ insn->off != 0) ||
+ (insn->src_reg != BPF_REG_0 &&
+ insn->src_reg != BPF_PSEUDO_CALL &&
+ insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
+ insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) {
+ verbose(env, "BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (env->cur_state->active_locks) {
+ if ((insn->src_reg == BPF_REG_0 &&
+ insn->imm != BPF_FUNC_spin_unlock) ||
+ (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) {
+ verbose(env,
+ "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
+ }
+ if (insn->src_reg == BPF_PSEUDO_CALL) {
+ err = check_func_call(env, insn, &env->insn_idx);
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ err = check_kfunc_call(env, insn, &env->insn_idx);
+ if (!err && is_bpf_throw_kfunc(insn))
+ return process_bpf_exit_full(env, do_print_state, true);
+ } else {
+ err = check_helper_call(env, insn, &env->insn_idx);
+ }
+ if (err)
+ return err;
+
+ mark_reg_scratched(env, BPF_REG_0);
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0 ||
+ (class == BPF_JMP && insn->imm != 0) ||
+ (class == BPF_JMP32 && insn->off != 0)) {
+ verbose(env, "BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (class == BPF_JMP)
+ env->insn_idx += insn->off + 1;
+ else
+ env->insn_idx += insn->imm + 1;
+ return 0;
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
+ verbose(env, "BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+ return process_bpf_exit_full(env, do_print_state, false);
+ } else {
+ err = check_cond_jmp_op(env, insn, &env->insn_idx);
+ if (err)
+ return err;
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ err = check_ld_abs(env, insn);
+ if (err)
+ return err;
+
+ } else if (mode == BPF_IMM) {
+ err = check_ld_imm(env, insn);
+ if (err)
+ return err;
+
+ env->insn_idx++;
+ sanitize_mark_insn_seen(env);
+ } else {
+ verbose(env, "invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ env->insn_idx++;
+ return 0;
+}
+
static int do_check(struct bpf_verifier_env *env)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_verifier_state *state = env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
- struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
bool do_print_state = false;
int prev_insn_idx = -1;
for (;;) {
- bool exception_exit = false;
struct bpf_insn *insn;
- u8 class;
+ struct bpf_insn_aux_data *insn_aux;
int err;
/* reset current history entry on each new instruction */
@@ -19416,7 +20010,7 @@ static int do_check(struct bpf_verifier_env *env)
}
insn = &insns[env->insn_idx];
- class = BPF_CLASS(insn->code);
+ insn_aux = &env->insn_aux_data[env->insn_idx];
if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
verbose(env,
@@ -19426,6 +20020,7 @@ static int do_check(struct bpf_verifier_env *env)
}
state->last_insn_idx = env->prev_insn_idx;
+ state->insn_idx = env->insn_idx;
if (is_prune_point(env, env->insn_idx)) {
err = is_state_visited(env, env->insn_idx);
@@ -19447,7 +20042,7 @@ static int do_check(struct bpf_verifier_env *env)
}
if (is_jmp_point(env, env->insn_idx)) {
- err = push_insn_history(env, state, 0, 0);
+ err = push_jmp_history(env, state, 0, 0);
if (err)
return err;
}
@@ -19486,216 +20081,67 @@ static int do_check(struct bpf_verifier_env *env)
return err;
}
- regs = cur_regs(env);
sanitize_mark_insn_seen(env);
prev_insn_idx = env->insn_idx;
- if (class == BPF_ALU || class == BPF_ALU64) {
- err = check_alu_op(env, insn);
- if (err)
- return err;
-
- } else if (class == BPF_LDX) {
- bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX;
+ /* Reduce verification complexity by stopping speculative path
+ * verification when a nospec is encountered.
+ */
+ if (state->speculative && insn_aux->nospec)
+ goto process_bpf_exit;
- /* Check for reserved fields is already done in
- * resolve_pseudo_ldimm64().
+ err = do_check_insn(env, &do_print_state);
+ if (error_recoverable_with_nospec(err) && state->speculative) {
+ /* Prevent this speculative path from ever reaching the
+ * insn that would have been unsafe to execute.
*/
- err = check_load_mem(env, insn, false, is_ldsx, true,
- "ldx");
- if (err)
- return err;
- } else if (class == BPF_STX) {
- if (BPF_MODE(insn->code) == BPF_ATOMIC) {
- err = check_atomic(env, insn);
- if (err)
- return err;
- env->insn_idx++;
- continue;
- }
-
- if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) {
- verbose(env, "BPF_STX uses reserved fields\n");
- return -EINVAL;
- }
-
- err = check_store_reg(env, insn, false);
- if (err)
- return err;
- } else if (class == BPF_ST) {
- enum bpf_reg_type dst_reg_type;
-
- if (BPF_MODE(insn->code) != BPF_MEM ||
- insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_ST uses reserved fields\n");
- return -EINVAL;
- }
- /* check src operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
- if (err)
- return err;
-
- dst_reg_type = regs[insn->dst_reg].type;
-
- /* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, env->insn_idx, insn->dst_reg,
- insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, -1, false, false);
- if (err)
- return err;
-
- err = save_aux_ptr_type(env, dst_reg_type, false);
+ insn_aux->nospec = true;
+ /* If it was an ADD/SUB insn, potentially remove any
+ * markings for alu sanitization.
+ */
+ insn_aux->alu_state = 0;
+ goto process_bpf_exit;
+ } else if (err < 0) {
+ return err;
+ } else if (err == PROCESS_BPF_EXIT) {
+ goto process_bpf_exit;
+ }
+ WARN_ON_ONCE(err);
+
+ if (state->speculative && insn_aux->nospec_result) {
+ /* If we are on a path that performed a jump-op, this
+ * may skip a nospec patched-in after the jump. This can
+ * currently never happen because nospec_result is only
+ * used for the write-ops
+ * `*(size*)(dst_reg+off)=src_reg|imm32` which must
+ * never skip the following insn. Still, add a warning
+ * to document this in case nospec_result is used
+ * elsewhere in the future.
+ *
+ * All non-branch instructions have a single
+ * fall-through edge. For these, nospec_result should
+ * already work.
+ */
+ if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP ||
+ BPF_CLASS(insn->code) == BPF_JMP32, env,
+ "speculation barrier after jump instruction may not have the desired effect"))
+ return -EFAULT;
+process_bpf_exit:
+ mark_verifier_state_scratched(env);
+ err = update_branch_counts(env, env->cur_state);
if (err)
return err;
- } else if (class == BPF_JMP || class == BPF_JMP32) {
- u8 opcode = BPF_OP(insn->code);
-
- env->jmps_processed++;
- if (opcode == BPF_CALL) {
- if (BPF_SRC(insn->code) != BPF_K ||
- (insn->src_reg != BPF_PSEUDO_KFUNC_CALL
- && insn->off != 0) ||
- (insn->src_reg != BPF_REG_0 &&
- insn->src_reg != BPF_PSEUDO_CALL &&
- insn->src_reg != BPF_PSEUDO_KFUNC_CALL) ||
- insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
- verbose(env, "BPF_CALL uses reserved fields\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_locks) {
- if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
- (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
- (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) {
- verbose(env, "function calls are not allowed while holding a lock\n");
- return -EINVAL;
- }
- }
- if (insn->src_reg == BPF_PSEUDO_CALL) {
- err = check_func_call(env, insn, &env->insn_idx);
- } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- err = check_kfunc_call(env, insn, &env->insn_idx);
- if (!err && is_bpf_throw_kfunc(insn)) {
- exception_exit = true;
- goto process_bpf_exit_full;
- }
- } else {
- err = check_helper_call(env, insn, &env->insn_idx);
- }
- if (err)
- return err;
-
- mark_reg_scratched(env, BPF_REG_0);
- } else if (opcode == BPF_JA) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0 ||
- (class == BPF_JMP && insn->imm != 0) ||
- (class == BPF_JMP32 && insn->off != 0)) {
- verbose(env, "BPF_JA uses reserved fields\n");
- return -EINVAL;
- }
-
- if (class == BPF_JMP)
- env->insn_idx += insn->off + 1;
- else
- env->insn_idx += insn->imm + 1;
- continue;
-
- } else if (opcode == BPF_EXIT) {
- if (BPF_SRC(insn->code) != BPF_K ||
- insn->imm != 0 ||
- insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0 ||
- class == BPF_JMP32) {
- verbose(env, "BPF_EXIT uses reserved fields\n");
- return -EINVAL;
- }
-process_bpf_exit_full:
- /* We must do check_reference_leak here before
- * prepare_func_exit to handle the case when
- * state->curframe > 0, it may be a callback
- * function, for which reference_state must
- * match caller reference state when it exits.
- */
- err = check_resource_leak(env, exception_exit, !env->cur_state->curframe,
- "BPF_EXIT instruction in main prog");
- if (err)
- return err;
-
- /* The side effect of the prepare_func_exit
- * which is being skipped is that it frees
- * bpf_func_state. Typically, process_bpf_exit
- * will only be hit with outermost exit.
- * copy_verifier_state in pop_stack will handle
- * freeing of any extra bpf_func_state left over
- * from not processing all nested function
- * exits. We also skip return code checks as
- * they are not needed for exceptional exits.
- */
- if (exception_exit)
- goto process_bpf_exit;
-
- if (state->curframe) {
- /* exit from nested function */
- err = prepare_func_exit(env, &env->insn_idx);
- if (err)
- return err;
- do_print_state = true;
- continue;
- }
-
- err = check_return_code(env, BPF_REG_0, "R0");
- if (err)
- return err;
-process_bpf_exit:
- mark_verifier_state_scratched(env);
- update_branch_counts(env, env->cur_state);
- err = pop_stack(env, &prev_insn_idx,
- &env->insn_idx, pop_log);
- if (err < 0) {
- if (err != -ENOENT)
- return err;
- break;
- } else {
- if (WARN_ON_ONCE(env->cur_state->loop_entry)) {
- verbose(env, "verifier bug: env->cur_state->loop_entry != NULL\n");
- return -EFAULT;
- }
- do_print_state = true;
- continue;
- }
- } else {
- err = check_cond_jmp_op(env, insn, &env->insn_idx);
- if (err)
- return err;
- }
- } else if (class == BPF_LD) {
- u8 mode = BPF_MODE(insn->code);
-
- if (mode == BPF_ABS || mode == BPF_IND) {
- err = check_ld_abs(env, insn);
- if (err)
- return err;
-
- } else if (mode == BPF_IMM) {
- err = check_ld_imm(env, insn);
- if (err)
+ err = pop_stack(env, &prev_insn_idx, &env->insn_idx,
+ pop_log);
+ if (err < 0) {
+ if (err != -ENOENT)
return err;
-
- env->insn_idx++;
- sanitize_mark_insn_seen(env);
+ break;
} else {
- verbose(env, "invalid BPF_LD mode\n");
- return -EINVAL;
+ do_print_state = true;
+ continue;
}
- } else {
- verbose(env, "unknown insn class %d\n", class);
- return -EINVAL;
}
-
- env->insn_idx++;
}
return 0;
@@ -20647,7 +21093,10 @@ static int opt_remove_nops(struct bpf_verifier_env *env)
static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
const union bpf_attr *attr)
{
- struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
+ struct bpf_insn *patch;
+ /* use env->insn_buf as two independent buffers */
+ struct bpf_insn *zext_patch = env->insn_buf;
+ struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2];
struct bpf_insn_aux_data *aux = env->insn_aux_data;
int i, patch_len, delta = 0, len = env->prog->len;
struct bpf_insn *insns = env->prog->insnsi;
@@ -20720,10 +21169,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
if (bpf_pseudo_kfunc_call(&insn))
continue;
- if (WARN_ON(load_reg == -1)) {
- verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
+ if (verifier_bug_if(load_reg == -1, env,
+ "zext_dst is set, but no reg is defined"))
return -EFAULT;
- }
zext_patch[0] = insn;
zext_patch[1].dst_reg = load_reg;
@@ -20767,8 +21215,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
-(subprogs[0].stack_depth + 8));
if (epilogue_cnt >= INSN_BUF_SIZE) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "epilogue is too long");
+ return -EFAULT;
} else if (epilogue_cnt) {
/* Save the ARG_PTR_TO_CTX for the epilogue to use */
cnt = 0;
@@ -20790,14 +21238,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (ops->gen_prologue || env->seen_direct_write) {
if (!ops->gen_prologue) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "gen_prologue is null");
+ return -EFAULT;
}
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
env->prog);
if (cnt >= INSN_BUF_SIZE) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "prologue is too long");
+ return -EFAULT;
} else if (cnt) {
new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
if (!new_prog)
@@ -20824,6 +21272,28 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
bpf_convert_ctx_access_t convert_ctx_access;
u8 mode;
+ if (env->insn_aux_data[i + delta].nospec) {
+ WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state);
+ struct bpf_insn *patch = insn_buf;
+
+ *patch++ = BPF_ST_NOSPEC();
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+			/* This cannot be easily merged with the
+			 * nospec_result case, because an insn may require a
+ * nospec before and after itself. Therefore also do not
+ * 'continue' here but potentially apply further
+ * patching to insn. *insn should equal patch[1] now.
+ */
+ }
+
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
@@ -20873,14 +21343,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
if (type == BPF_WRITE &&
- env->insn_aux_data[i + delta].sanitize_stack_spill) {
- struct bpf_insn patch[] = {
- *insn,
- BPF_ST_NOSPEC(),
- };
+ env->insn_aux_data[i + delta].nospec_result) {
+ /* nospec_result is only used to mitigate Spectre v4 and
+ * to limit verification-time for Spectre v1.
+ */
+ struct bpf_insn *patch = insn_buf;
- cnt = ARRAY_SIZE(patch);
- new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+ *patch++ = *insn;
+ *patch++ = BPF_ST_NOSPEC();
+ cnt = patch - insn_buf;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
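Both nospec patch sites above splice extra instructions into the program and then advance delta by cnt - 1, so that index i, which still walks the original instruction stream, maps onto the patched program as i + delta. A tiny standalone sketch of that bookkeeping, with made-up instruction counts:

#include <stdio.h>

int main(void)
{
	int prog_len = 5;	/* original program: insns 0..4 */
	int delta = 0;
	/* pretend original insns 1 and 3 each get replaced by a 2-insn patch */
	int patch_at[] = { 1, 3 };
	int patch_cnt = 2;	/* each patch expands one insn into two */

	for (int k = 0; k < 2; k++) {
		int i = patch_at[k];
		/* where original insn i lives in the already-patched program */
		int patched_idx = i + delta;

		printf("original insn %d now lives at index %d\n", i, patched_idx);
		prog_len += patch_cnt - 1;
		delta += patch_cnt - 1;
	}
	printf("patched program length: %d\n", prog_len);	/* 7 */
	return 0;
}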
@@ -20915,6 +21387,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
* for this case.
*/
case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
+ case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED:
if (type == BPF_READ) {
if (BPF_MODE(insn->code) == BPF_MEM)
insn->code = BPF_LDX | BPF_PROBE_MEM |
@@ -20953,8 +21426,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
u8 size_code;
if (type == BPF_WRITE) {
- verbose(env, "bpf verifier narrow ctx access misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "narrow ctx access misconfigured");
+ return -EFAULT;
}
size_code = BPF_H;
@@ -20972,16 +21445,16 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
&target_size);
if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
(ctx_field_size && !target_size)) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "error during ctx access conversion");
+ return -EFAULT;
}
if (is_narrower_load && size < target_size) {
u8 shift = bpf_ctx_narrow_access_offset(
off, size, size_default) * 8;
if (shift && cnt + 1 >= INSN_BUF_SIZE) {
- verbose(env, "bpf verifier narrow ctx load misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "narrow ctx load misconfigured");
+ return -EFAULT;
}
if (ctx_field_size <= 4) {
if (shift)
@@ -21040,11 +21513,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
* propagated in any case.
*/
subprog = find_subprog(env, i + insn->imm + 1);
- if (subprog < 0) {
- WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- i + insn->imm + 1);
+ if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
+ i + insn->imm + 1))
return -EFAULT;
- }
/* temporarily remember subprog id inside insn instead of
* aux_data, since next loop will split up all insns into funcs
*/
@@ -21413,8 +21884,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
*/
desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
if (!desc) {
- verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n",
- insn->imm);
+ verifier_bug(env, "kernel function descriptor not found for func_id %u",
+ insn->imm);
return -EFAULT;
}
@@ -21429,8 +21900,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) {
- verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
- insn_idx);
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
return -EFAULT;
}
@@ -21446,15 +21917,15 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) {
- verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n",
- insn_idx);
+ verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
return -EFAULT;
}
if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] &&
!kptr_struct_meta) {
- verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
- insn_idx);
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
return -EFAULT;
}
@@ -21476,8 +21947,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
if (!kptr_struct_meta) {
- verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
- insn_idx);
+ verifier_bug(env, "kptr_struct_meta expected at insn_idx %d",
+ insn_idx);
return -EFAULT;
}
@@ -21487,13 +21958,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
- } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
- struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) };
+ }
- insn_buf[0] = ld_addrs[0];
- insn_buf[1] = ld_addrs[1];
- insn_buf[2] = *insn;
- *cnt = 3;
+ if (env->insn_aux_data[insn_idx].arg_prog) {
+ u32 regno = env->insn_aux_data[insn_idx].arg_prog;
+ struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) };
+ int idx = *cnt;
+
+ insn_buf[idx++] = ld_addrs[0];
+ insn_buf[idx++] = ld_addrs[1];
+ insn_buf[idx++] = *insn;
+ *cnt = idx;
}
return 0;
}
@@ -21507,7 +21982,7 @@ static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *pat
/* We only reserve one slot for hidden subprogs in subprog_info. */
if (env->hidden_subprog_cnt) {
- verbose(env, "verifier internal error: only one hidden subprog supported\n");
+ verifier_bug(env, "only one hidden subprog supported");
return -EFAULT;
}
/* We're not patching any existing instruction, just appending the new
@@ -21547,13 +22022,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
u16 stack_depth_extra = 0;
if (env->seen_exception && !env->exception_callback_subprog) {
- struct bpf_insn patch[] = {
- env->prog->insnsi[insn_cnt - 1],
- BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
- BPF_EXIT_INSN(),
- };
+ struct bpf_insn *patch = insn_buf;
- ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch));
+ *patch++ = env->prog->insnsi[insn_cnt - 1];
+ *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *patch++ = BPF_EXIT_INSN();
+ ret = add_hidden_subprog(env, insn_buf, patch - insn_buf);
if (ret < 0)
return ret;
prog = env->prog;
@@ -21589,20 +22063,18 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->off == 1 && insn->imm == -1) {
bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
bool isdiv = BPF_OP(insn->code) == BPF_DIV;
- struct bpf_insn *patchlet;
- struct bpf_insn chk_and_sdiv[] = {
- BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_NEG | BPF_K, insn->dst_reg,
- 0, 0, 0),
- };
- struct bpf_insn chk_and_smod[] = {
- BPF_MOV32_IMM(insn->dst_reg, 0),
- };
+ struct bpf_insn *patch = insn_buf;
- patchlet = isdiv ? chk_and_sdiv : chk_and_smod;
- cnt = isdiv ? ARRAY_SIZE(chk_and_sdiv) : ARRAY_SIZE(chk_and_smod);
+ if (isdiv)
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ else
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
- new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -21621,83 +22093,79 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
bool isdiv = BPF_OP(insn->code) == BPF_DIV;
bool is_sdiv = isdiv && insn->off == 1;
bool is_smod = !isdiv && insn->off == 1;
- struct bpf_insn *patchlet;
- struct bpf_insn chk_and_div[] = {
- /* [R,W]x div 0 -> 0 */
- BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JNE | BPF_K, insn->src_reg,
- 0, 2, 0),
- BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
- BPF_JMP_IMM(BPF_JA, 0, 0, 1),
- *insn,
- };
- struct bpf_insn chk_and_mod[] = {
- /* [R,W]x mod 0 -> [R,W]x */
- BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, insn->src_reg,
- 0, 1 + (is64 ? 0 : 1), 0),
- *insn,
- BPF_JMP_IMM(BPF_JA, 0, 0, 1),
- BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
- };
- struct bpf_insn chk_and_sdiv[] = {
+ struct bpf_insn *patch = insn_buf;
+
+ if (is_sdiv) {
/* [R,W]x sdiv 0 -> 0
* LLONG_MIN sdiv -1 -> LLONG_MIN
* INT_MIN sdiv -1 -> INT_MIN
*/
- BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
- BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_ADD | BPF_K, BPF_REG_AX,
- 0, 0, 1),
- BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JGT | BPF_K, BPF_REG_AX,
- 0, 4, 1),
- BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, BPF_REG_AX,
- 0, 1, 0),
- BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_MOV | BPF_K, insn->dst_reg,
- 0, 0, 0),
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 4, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 1, 0);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_MOV | BPF_K, insn->dst_reg,
+ 0, 0, 0);
/* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */
- BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_NEG | BPF_K, insn->dst_reg,
- 0, 0, 0),
- BPF_JMP_IMM(BPF_JA, 0, 0, 1),
- *insn,
- };
- struct bpf_insn chk_and_smod[] = {
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_NEG | BPF_K, insn->dst_reg,
+ 0, 0, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
+ } else if (is_smod) {
/* [R,W]x mod 0 -> [R,W]x */
/* [R,W]x mod -1 -> 0 */
- BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
- BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
- BPF_ADD | BPF_K, BPF_REG_AX,
- 0, 0, 1),
- BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JGT | BPF_K, BPF_REG_AX,
- 0, 3, 1),
- BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
- BPF_JEQ | BPF_K, BPF_REG_AX,
- 0, 3 + (is64 ? 0 : 1), 1),
- BPF_MOV32_IMM(insn->dst_reg, 0),
- BPF_JMP_IMM(BPF_JA, 0, 0, 1),
- *insn,
- BPF_JMP_IMM(BPF_JA, 0, 0, 1),
- BPF_MOV32_REG(insn->dst_reg, insn->dst_reg),
- };
-
- if (is_sdiv) {
- patchlet = chk_and_sdiv;
- cnt = ARRAY_SIZE(chk_and_sdiv);
- } else if (is_smod) {
- patchlet = chk_and_smod;
- cnt = ARRAY_SIZE(chk_and_smod) - (is64 ? 2 : 0);
+ *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) |
+ BPF_ADD | BPF_K, BPF_REG_AX,
+ 0, 0, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JGT | BPF_K, BPF_REG_AX,
+ 0, 3, 1);
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, BPF_REG_AX,
+ 0, 3 + (is64 ? 0 : 1), 1);
+ *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
+ } else if (isdiv) {
+ /* [R,W]x div 0 -> 0 */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JNE | BPF_K, insn->src_reg,
+ 0, 2, 0);
+ *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg);
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = *insn;
+ cnt = patch - insn_buf;
} else {
- patchlet = isdiv ? chk_and_div : chk_and_mod;
- cnt = isdiv ? ARRAY_SIZE(chk_and_div) :
- ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0);
+ /* [R,W]x mod 0 -> [R,W]x */
+ *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) |
+ BPF_JEQ | BPF_K, insn->src_reg,
+ 0, 1 + (is64 ? 0 : 1), 0);
+ *patch++ = *insn;
+
+ if (!is64) {
+ *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+ *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg);
+ }
+ cnt = patch - insn_buf;
}
- new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
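The open-coded sequences above implement, at BPF level, the runtime semantics spelled out in their comments: division by zero yields zero, modulo by zero leaves the destination unchanged, signed modulo by -1 yields zero, and LLONG_MIN sdiv -1 wraps back to LLONG_MIN. A small userspace sketch of those semantics, assuming exactly the behaviour stated in the hunk comments (the helper names are invented for illustration):

#include <limits.h>
#include <stdio.h>

static long long bpf_sdiv(long long dst, long long src)
{
	if (src == 0)
		return 0;	/* sdiv by zero yields 0 */
	if (src == -1)		/* avoid the one overflowing case of '/' */
		return (long long)(0ULL - (unsigned long long)dst);
	return dst / src;
}

static long long bpf_smod(long long dst, long long src)
{
	if (src == 0)
		return dst;	/* smod by zero leaves dst unchanged */
	if (src == -1)
		return 0;	/* smod by -1 yields 0 */
	return dst % src;
}

int main(void)
{
	printf("%lld\n", bpf_sdiv(42, 0));		/* 0 */
	printf("%lld\n", bpf_sdiv(LLONG_MIN, -1));	/* LLONG_MIN */
	printf("%lld\n", bpf_smod(42, 0));		/* 42 */
	printf("%lld\n", bpf_smod(42, -1));		/* 0 */
	return 0;
}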
@@ -21711,7 +22179,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) == BPF_PROBE_MEM ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
- struct bpf_insn *patch = &insn_buf[0];
+ struct bpf_insn *patch = insn_buf;
u64 uaddress_limit = bpf_arch_uaddress_limit();
if (!uaddress_limit)
@@ -21743,8 +22211,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BPF_MODE(insn->code) == BPF_IND)) {
cnt = env->ops->gen_ld_abs(insn, insn_buf);
if (cnt == 0 || cnt >= INSN_BUF_SIZE) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "%d insns generated for ld_abs", cnt);
+ return -EFAULT;
}
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
@@ -21762,7 +22230,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
- struct bpf_insn *patch = &insn_buf[0];
+ struct bpf_insn *patch = insn_buf;
bool issrc, isneg, isimm;
u32 off_reg;
@@ -22079,8 +22547,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (cnt == -EOPNOTSUPP)
goto patch_map_ops_generic;
if (cnt <= 0 || cnt >= INSN_BUF_SIZE) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "%d insns generated for map lookup", cnt);
+ return -EFAULT;
}
new_prog = bpf_patch_insn_data(env, i + delta,
@@ -22367,9 +22835,9 @@ patch_call_imm:
* programs to call them, must be real in-kernel functions
*/
if (!fn->func) {
- verbose(env,
- "kernel subsystem misconfigured func %s#%d\n",
- func_id_name(insn->imm), insn->imm);
+ verifier_bug(env,
+ "not inlined functions %s#%d is missing func",
+ func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
insn->imm = fn->func - __bpf_call_base;
@@ -22403,7 +22871,7 @@ next_insn:
continue;
/* We need two slots in case timed may_goto is supported. */
if (stack_slots > slots) {
- verbose(env, "verifier bug: stack_slots supports may_goto only\n");
+ verifier_bug(env, "stack_slots supports may_goto only");
return -EFAULT;
}
@@ -22439,8 +22907,8 @@ next_insn:
if (!map_ptr->ops->map_poke_track ||
!map_ptr->ops->map_poke_untrack ||
!map_ptr->ops->map_poke_run) {
- verbose(env, "bpf verifier is misconfigured\n");
- return -EINVAL;
+ verifier_bug(env, "poke tab is misconfigured");
+ return -EFAULT;
}
ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux);
@@ -22630,7 +23098,12 @@ static void free_states(struct bpf_verifier_env *env)
{
struct bpf_verifier_state_list *sl;
struct list_head *head, *pos, *tmp;
- int i;
+ struct bpf_scc_info *info;
+ int i, j;
+
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ while (!pop_stack(env, NULL, NULL, false));
list_for_each_safe(pos, tmp, &env->free_list) {
sl = container_of(pos, struct bpf_verifier_state_list, node);
@@ -22639,6 +23112,14 @@ static void free_states(struct bpf_verifier_env *env)
}
INIT_LIST_HEAD(&env->free_list);
+ for (i = 0; i < env->scc_cnt; ++i) {
+ info = env->scc_info[i];
+ for (j = 0; j < info->num_visits; j++)
+ free_backedges(&info->visits[j]);
+ kvfree(info);
+ env->scc_info[i] = NULL;
+ }
+
if (!env->explored_states)
return;
@@ -22666,13 +23147,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
env->prev_linfo = NULL;
env->pass_cnt++;
- state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL_ACCOUNT);
if (!state)
return -ENOMEM;
state->curframe = 0;
state->speculative = false;
state->branches = 1;
- state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
+ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT);
if (!state->frame[0]) {
kfree(state);
return -ENOMEM;
@@ -22723,11 +23204,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
__mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
reg->type = PTR_TO_MEM;
- if (arg->arg_type & PTR_MAYBE_NULL)
- reg->type |= PTR_MAYBE_NULL;
+ reg->type |= arg->arg_type &
+ (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY);
mark_reg_known_zero(env, regs, i);
reg->mem_size = arg->mem_size;
- reg->id = ++env->id_gen;
+ if (arg->arg_type & PTR_MAYBE_NULL)
+ reg->id = ++env->id_gen;
} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
reg->type = PTR_TO_BTF_ID;
if (arg->arg_type & PTR_MAYBE_NULL)
@@ -22744,8 +23226,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
/* caller can pass either PTR_TO_ARENA or SCALAR */
mark_reg_unknown(env, regs, i);
} else {
- WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n",
- i - BPF_REG_1, arg->arg_type);
+ verifier_bug(env, "unhandled arg#%d type %d",
+ i - BPF_REG_1, arg->arg_type);
ret = -EFAULT;
goto out;
}
@@ -22775,14 +23257,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
ret = do_check(env);
out:
- /* check for NULL is necessary, since cur_state can be freed inside
- * do_check() under memory pressure.
- */
- if (env->cur_state) {
- free_verifier_state(env->cur_state, true);
- env->cur_state = NULL;
- }
- while (!pop_stack(env, NULL, NULL, false));
if (!ret && pop_log)
bpf_vlog_reset(&env->log, 0);
free_states(env);
@@ -22898,7 +23372,7 @@ static void print_verification_stats(struct bpf_verifier_env *env)
int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
const struct bpf_ctx_arg_aux *info, u32 cnt)
{
- prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL);
+ prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL_ACCOUNT);
prog->aux->ctx_arg_info_size = cnt;
return prog->aux->ctx_arg_info ? 0 : -ENOMEM;
@@ -23507,11 +23981,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
return ret;
} else if (prog->type == BPF_PROG_TYPE_TRACING &&
btf_id_set_contains(&btf_id_deny, btf_id)) {
+ verbose(env, "Attaching tracing programs to function '%s' is rejected.\n",
+ tgt_info.tgt_name);
return -EINVAL;
} else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
prog->expected_attach_type == BPF_MODIFY_RETURN) &&
btf_id_set_contains(&noreturn_deny, btf_id)) {
- verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n");
+ verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n",
+ tgt_info.tgt_name);
return -EINVAL;
}
@@ -23640,6 +24117,7 @@ static bool can_jump(struct bpf_insn *insn)
case BPF_JSLT:
case BPF_JSLE:
case BPF_JCOND:
+ case BPF_JSET:
return true;
}
@@ -23842,7 +24320,7 @@ static int compute_live_registers(struct bpf_verifier_env *env)
* - repeat the computation while {in,out} fields changes for
* any instruction.
*/
- state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL);
+ state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL_ACCOUNT);
if (!state) {
err = -ENOMEM;
goto out;
@@ -23880,6 +24358,10 @@ static int compute_live_registers(struct bpf_verifier_env *env)
if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "Live regs before insn:\n");
for (i = 0; i < insn_cnt; ++i) {
+ if (env->insn_aux_data[i].scc)
+ verbose(env, "%3d ", env->insn_aux_data[i].scc);
+ else
+ verbose(env, " ");
verbose(env, "%3d: ", i);
for (j = BPF_REG_0; j < BPF_REG_10; ++j)
if (insn_aux[i].live_regs_before & BIT(j))
@@ -23901,6 +24383,185 @@ out:
return err;
}
+/*
+ * Compute strongly connected components (SCCs) on the CFG.
+ * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc.
+ * If an instruction is the sole member of its SCC and there are no self edges,
+ * assign it an SCC number of zero.
+ * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation.
+ */
+static int compute_scc(struct bpf_verifier_env *env)
+{
+ const u32 NOT_ON_STACK = U32_MAX;
+
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ const u32 insn_cnt = env->prog->len;
+ int stack_sz, dfs_sz, err = 0;
+ u32 *stack, *pre, *low, *dfs;
+ u32 succ_cnt, i, j, t, w;
+ u32 next_preorder_num;
+ u32 next_scc_id;
+ bool assign_scc;
+ u32 succ[2];
+
+ next_preorder_num = 1;
+ next_scc_id = 1;
+ /*
+ * - 'stack' accumulates vertices in DFS order, see invariant comment below;
+ * - 'pre[t] == p' => preorder number of vertex 't' is 'p';
+ * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n';
+ * - 'dfs' DFS traversal stack, used to emulate explicit recursion.
+ */
+ stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+ dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT);
+ if (!stack || !pre || !low || !dfs) {
+ err = -ENOMEM;
+ goto exit;
+ }
+ /*
+ * References:
+ * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms"
+ * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components"
+ *
+ * The algorithm maintains the following invariant:
+ * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]';
+ * - then, vertex 'u' remains on stack while vertex 'v' is on stack.
+ *
+ * Consequently:
+ * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u',
+ * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack,
+ * and thus there is an SCC (loop) containing both 'u' and 'v'.
+ * - If 'low[v] == pre[v]', loops containing 'v' have been explored,
+ * and 'v' can be considered the root of some SCC.
+ *
+ * Here is a pseudo-code for an explicitly recursive version of the algorithm:
+ *
+ * NOT_ON_STACK = insn_cnt + 1
+ * pre = [0] * insn_cnt
+ * low = [0] * insn_cnt
+ * scc = [0] * insn_cnt
+ * stack = []
+ *
+ * next_preorder_num = 1
+ * next_scc_id = 1
+ *
+ * def recur(w):
+ * nonlocal next_preorder_num
+ * nonlocal next_scc_id
+ *
+ * pre[w] = next_preorder_num
+ * low[w] = next_preorder_num
+ * next_preorder_num += 1
+ * stack.append(w)
+ * for s in successors(w):
+	 *		 # Note: for the classic algorithm the block below should look as:
+ * #
+ * # if pre[s] == 0:
+ * # recur(s)
+ * # low[w] = min(low[w], low[s])
+ * # elif low[s] != NOT_ON_STACK:
+ * # low[w] = min(low[w], pre[s])
+ * #
+ * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])'
+	 *		 # does not break the invariant and makes the iterative version of the algorithm
+ * # simpler. See 'Algorithm #3' from [2].
+ *
+ * # 's' not yet visited
+ * if pre[s] == 0:
+ * recur(s)
+ * # if 's' is on stack, pick lowest reachable preorder number from it;
+ * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]',
+ * # so 'min' would be a noop.
+ * low[w] = min(low[w], low[s])
+ *
+ * if low[w] == pre[w]:
+ * # 'w' is the root of an SCC, pop all vertices
+ * # below 'w' on stack and assign same SCC to them.
+ * while True:
+ * t = stack.pop()
+ * low[t] = NOT_ON_STACK
+ * scc[t] = next_scc_id
+ * if t == w:
+ * break
+ * next_scc_id += 1
+ *
+ * for i in range(0, insn_cnt):
+ * if pre[i] == 0:
+ * recur(i)
+ *
+ * Below implementation replaces explicit recursion with array 'dfs'.
+ */
+ for (i = 0; i < insn_cnt; i++) {
+ if (pre[i])
+ continue;
+ stack_sz = 0;
+ dfs_sz = 1;
+ dfs[0] = i;
+dfs_continue:
+ while (dfs_sz) {
+ w = dfs[dfs_sz - 1];
+ if (pre[w] == 0) {
+ low[w] = next_preorder_num;
+ pre[w] = next_preorder_num;
+ next_preorder_num++;
+ stack[stack_sz++] = w;
+ }
+ /* Visit 'w' successors */
+ succ_cnt = insn_successors(env->prog, w, succ);
+ for (j = 0; j < succ_cnt; ++j) {
+ if (pre[succ[j]]) {
+ low[w] = min(low[w], low[succ[j]]);
+ } else {
+ dfs[dfs_sz++] = succ[j];
+ goto dfs_continue;
+ }
+ }
+ /*
+ * Preserve the invariant: if some vertex above in the stack
+ * is reachable from 'w', keep 'w' on the stack.
+ */
+ if (low[w] < pre[w]) {
+ dfs_sz--;
+ goto dfs_continue;
+ }
+ /*
+ * Assign SCC number only if component has two or more elements,
+ * or if component has a self reference.
+ */
+ assign_scc = stack[stack_sz - 1] != w;
+ for (j = 0; j < succ_cnt; ++j) {
+ if (succ[j] == w) {
+ assign_scc = true;
+ break;
+ }
+ }
+ /* Pop component elements from stack */
+ do {
+ t = stack[--stack_sz];
+ low[t] = NOT_ON_STACK;
+ if (assign_scc)
+ aux[t].scc = next_scc_id;
+ } while (t != w);
+ if (assign_scc)
+ next_scc_id++;
+ dfs_sz--;
+ }
+ }
+ env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL_ACCOUNT);
+ if (!env->scc_info) {
+ err = -ENOMEM;
+ goto exit;
+ }
+exit:
+ kvfree(stack);
+ kvfree(pre);
+ kvfree(low);
+ kvfree(dfs);
+ return err;
+}
+
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
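The pseudo-code in compute_scc()'s comment corresponds one-to-one to the iterative loop that follows it. For experimenting with the scheme outside the kernel, a self-contained sketch of the same Tarjan/Pearce iteration on a tiny hard-coded graph could look as follows; the graph, array sizes and names are illustrative only, and single-vertex components without self loops keep SCC id 0, as in the patch:

#include <stdio.h>

#define N 5
#define NOT_ON_STACK 0xffffffffu

/* adjacency: 0->1->2->0 is a loop, 3->3 is a self loop, 4->0 is acyclic */
static const unsigned int succ[N][2] = { {1}, {2}, {0}, {3}, {0} };
static const unsigned int succ_cnt[N] = { 1, 1, 1, 1, 1 };

static unsigned int pre[N], low[N], scc[N];

int main(void)
{
	unsigned int stack[N], dfs[N];
	unsigned int next_pre = 1, next_scc = 1;
	int stack_sz, dfs_sz;

	for (unsigned int i = 0; i < N; i++) {
		if (pre[i])
			continue;
		stack_sz = 0;
		dfs_sz = 1;
		dfs[0] = i;
		while (dfs_sz) {
			unsigned int w = dfs[dfs_sz - 1], j, t;
			int assign, defer = 0;

			if (!pre[w]) {
				pre[w] = low[w] = next_pre++;
				stack[stack_sz++] = w;
			}
			for (j = 0; j < succ_cnt[w]; j++) {
				unsigned int s = succ[w][j];

				if (!pre[s]) {		/* unvisited: emulate recursion */
					dfs[dfs_sz++] = s;
					defer = 1;
					break;
				}
				if (low[s] < low[w])	/* popped nodes have huge low[] */
					low[w] = low[s];
			}
			if (defer)
				continue;
			if (low[w] < pre[w]) {		/* some ancestor still reachable */
				dfs_sz--;
				continue;
			}
			assign = stack[stack_sz - 1] != w;	/* multi-node component */
			for (j = 0; j < succ_cnt[w]; j++)
				if (succ[w][j] == w)
					assign = 1;		/* self loop */
			do {
				t = stack[--stack_sz];
				low[t] = NOT_ON_STACK;
				if (assign)
					scc[t] = next_scc;
			} while (t != w);
			if (assign)
				next_scc++;
			dfs_sz--;
		}
	}
	for (unsigned int v = 0; v < N; v++)
		printf("node %u -> scc %u\n", v, scc[v]);
	return 0;
}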
@@ -23909,6 +24570,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
u32 log_true_size;
bool is_priv;
+ BTF_TYPE_EMIT(enum bpf_features);
+
/* no program is valid */
if (ARRAY_SIZE(bpf_verifier_ops) == 0)
return -EINVAL;
@@ -23916,7 +24579,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
/* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
- env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
+ env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL_ACCOUNT);
if (!env)
return -ENOMEM;
@@ -23979,7 +24642,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
env->explored_states = kvcalloc(state_htab_size(env),
sizeof(struct list_head),
- GFP_USER);
+ GFP_KERNEL_ACCOUNT);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;
@@ -24022,6 +24685,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret)
goto skip_full_check;
+ ret = compute_scc(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = compute_live_registers(env);
if (ret < 0)
goto skip_full_check;
@@ -24106,7 +24773,7 @@ skip_full_check:
/* if program passed verifier, update used_maps in bpf_prog_info */
env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
sizeof(env->used_maps[0]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
@@ -24121,7 +24788,7 @@ skip_full_check:
/* if program passed verifier, update used_btfs in bpf_prog_aux */
env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
sizeof(env->used_btfs[0]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!env->prog->aux->used_btfs) {
ret = -ENOMEM;
goto err_release_maps;
@@ -24162,9 +24829,9 @@ err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
- kvfree(env->insn_hist);
err_free_env:
kvfree(env->cfg.insn_postorder);
+ kvfree(env->scc_info);
kvfree(env);
return ret;
}
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 95ab39e1ec8f..b14e61c64a34 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -270,9 +270,9 @@ int cgroup_task_count(const struct cgroup *cgrp);
/*
* rstat.c
*/
-int cgroup_rstat_init(struct cgroup *cgrp);
-void cgroup_rstat_exit(struct cgroup *cgrp);
-void cgroup_rstat_boot(void);
+int css_rstat_init(struct cgroup_subsys_state *css);
+void css_rstat_exit(struct cgroup_subsys_state *css);
+int ss_rstat_init(struct cgroup_subsys *ss);
void cgroup_base_stat_cputime_show(struct seq_file *seq);
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 63e5b90da1f3..312c6a8b55bb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -95,6 +95,9 @@ EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
#endif
+struct blocking_notifier_head cgroup_lifetime_notifier =
+ BLOCKING_NOTIFIER_INIT(cgroup_lifetime_notifier);
+
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
static bool cgroup_debug __read_mostly;
@@ -161,10 +164,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
};
#undef SUBSYS
-static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
+static DEFINE_PER_CPU(struct css_rstat_cpu, root_rstat_cpu);
+static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu);
/* the default hierarchy */
-struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
+struct cgroup_root cgrp_dfl_root = {
+ .cgrp.self.rstat_cpu = &root_rstat_cpu,
+ .cgrp.rstat_base_cpu = &root_rstat_base_cpu,
+};
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
@@ -1335,6 +1342,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
{
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
+ int ret;
trace_cgroup_destroy_root(root);
@@ -1343,6 +1351,10 @@ static void cgroup_destroy_root(struct cgroup_root *root)
BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->self.children));
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
+
/* Rebind all subsystems back to the default hierarchy */
WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
@@ -1371,7 +1383,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_unlock();
- cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
@@ -1715,7 +1726,7 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
- if (!css->ss) {
+ if (css_is_self(css)) {
if (cgroup_on_dfl(cgrp)) {
cgroup_addrm_files(css, cgrp,
cgroup_base_files, false);
@@ -1747,7 +1758,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (css->flags & CSS_VISIBLE)
return 0;
- if (!css->ss) {
+ if (css_is_self(css)) {
if (cgroup_on_dfl(cgrp)) {
ret = cgroup_addrm_files(css, cgrp,
cgroup_base_files, true);
@@ -1876,13 +1887,6 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
}
spin_unlock_irq(&css_set_lock);
- if (ss->css_rstat_flush) {
- list_del_rcu(&css->rstat_css_node);
- synchronize_rcu();
- list_add_rcu(&css->rstat_css_node,
- &dcgrp->rstat_css_list);
- }
-
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
@@ -2065,12 +2069,16 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
cgrp->dom_cgrp = cgrp;
cgrp->max_descendants = INT_MAX;
cgrp->max_depth = INT_MAX;
- INIT_LIST_HEAD(&cgrp->rstat_css_list);
prev_cputime_init(&cgrp->prev_cputime);
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+#ifdef CONFIG_CGROUP_BPF
+ for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++)
+ cgrp->bpf.revisions[i] = 1;
+#endif
+
init_waitqueue_head(&cgrp->offline_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
@@ -2146,7 +2154,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto destroy_root;
- ret = cgroup_rstat_init(root_cgrp);
+ ret = css_rstat_init(&root_cgrp->self);
if (ret)
goto destroy_root;
@@ -2154,10 +2162,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto exit_stats;
- if (root == &cgrp_dfl_root) {
- ret = cgroup_bpf_inherit(root_cgrp);
- WARN_ON_ONCE(ret);
- }
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE, root_cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
trace_cgroup_setup_root(root);
@@ -2188,7 +2195,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
goto out;
exit_stats:
- cgroup_rstat_exit(root_cgrp);
+ css_rstat_exit(&root_cgrp->self);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
@@ -5444,8 +5451,9 @@ static void css_free_rwork_fn(struct work_struct *work)
struct cgroup *cgrp = css->cgroup;
percpu_ref_exit(&css->refcnt);
+ css_rstat_exit(css);
- if (ss) {
+ if (!css_is_self(css)) {
/* css free path */
struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
@@ -5474,7 +5482,6 @@ static void css_free_rwork_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
- cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -5499,14 +5506,10 @@ static void css_release_work_fn(struct work_struct *work)
css->flags |= CSS_RELEASED;
list_del_rcu(&css->sibling);
- if (ss) {
+ if (!css_is_self(css)) {
struct cgroup *parent_cgrp;
- /* css release path */
- if (!list_empty(&css->rstat_css_node)) {
- cgroup_rstat_flush(cgrp);
- list_del_rcu(&css->rstat_css_node);
- }
+ css_rstat_flush(css);
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
@@ -5532,7 +5535,7 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
TRACE_CGROUP_PATH(release, cgrp);
- cgroup_rstat_flush(cgrp);
+ css_rstat_flush(&cgrp->self);
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5580,7 +5583,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
- INIT_LIST_HEAD(&css->rstat_css_node);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -5589,9 +5591,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
- if (ss->css_rstat_flush)
- list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
-
BUG_ON(cgroup_css(cgrp, ss));
}
@@ -5684,6 +5683,10 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
goto err_free_css;
css->id = err;
+ err = css_rstat_init(css);
+ if (err)
+ goto err_free_css;
+
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
cgroup_idr_replace(&ss->css_idr, css, css->id);
@@ -5697,7 +5700,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
- list_del_rcu(&css->rstat_css_node);
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
return ERR_PTR(err);
@@ -5713,7 +5715,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
struct cgroup_root *root = parent->root;
struct cgroup *cgrp, *tcgrp;
struct kernfs_node *kn;
- int level = parent->level + 1;
+ int i, level = parent->level + 1;
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
@@ -5725,17 +5727,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_free_cgrp;
- ret = cgroup_rstat_init(cgrp);
- if (ret)
- goto out_cancel_ref;
-
/* create the directory */
kn = kernfs_create_dir_ns(parent->kn, name, mode,
current_fsuid(), current_fsgid(),
cgrp, NULL);
if (IS_ERR(kn)) {
ret = PTR_ERR(kn);
- goto out_stat_exit;
+ goto out_cancel_ref;
}
cgrp->kn = kn;
@@ -5745,15 +5743,20 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
cgrp->root = root;
cgrp->level = level;
- ret = psi_cgroup_alloc(cgrp);
+ /*
+ * Now that init_cgroup_housekeeping() has been called and cgrp->self
+ * is setup, it is safe to perform rstat initialization on it.
+ */
+ ret = css_rstat_init(&cgrp->self);
if (ret)
goto out_kernfs_remove;
- if (cgrp->root == &cgrp_dfl_root) {
- ret = cgroup_bpf_inherit(cgrp);
- if (ret)
- goto out_psi_free;
- }
+ ret = psi_cgroup_alloc(cgrp);
+ if (ret)
+ goto out_stat_exit;
+
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+ cgrp->ancestors[tcgrp->level] = tcgrp;
/*
* New cgroup inherits effective freeze counter, and
@@ -5771,24 +5774,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
set_bit(CGRP_FROZEN, &cgrp->flags);
}
- spin_lock_irq(&css_set_lock);
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestors[tcgrp->level] = tcgrp;
-
- if (tcgrp != cgrp) {
- tcgrp->nr_descendants++;
-
- /*
- * If the new cgroup is frozen, all ancestor cgroups
- * get a new frozen descendant, but their state can't
- * change because of this.
- */
- if (cgrp->freezer.e_freeze)
- tcgrp->freezer.nr_frozen_descendants++;
- }
- }
- spin_unlock_irq(&css_set_lock);
-
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -5797,7 +5782,29 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
cgrp->self.serial_nr = css_serial_nr_next++;
+ ret = blocking_notifier_call_chain_robust(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_ONLINE,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto out_psi_free;
+
/* allocation complete, commit to creation */
+ spin_lock_irq(&css_set_lock);
+ for (i = 0; i < level; i++) {
+ tcgrp = cgrp->ancestors[i];
+ tcgrp->nr_descendants++;
+
+ /*
+ * If the new cgroup is frozen, all ancestor cgroups get a new
+ * frozen descendant, but their state can't change because of
+ * this.
+ */
+ if (cgrp->freezer.e_freeze)
+ tcgrp->freezer.nr_frozen_descendants++;
+ }
+ spin_unlock_irq(&css_set_lock);
+
list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
atomic_inc(&root->nr_cgrps);
cgroup_get_live(parent);
@@ -5815,10 +5822,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
out_psi_free:
psi_cgroup_free(cgrp);
+out_stat_exit:
+ css_rstat_exit(&cgrp->self);
out_kernfs_remove:
kernfs_remove(cgrp->kn);
-out_stat_exit:
- cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -6015,7 +6022,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
- int ssid;
+ int ssid, ret;
lockdep_assert_held(&cgroup_mutex);
@@ -6073,8 +6080,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
cgroup1_check_for_release(parent);
- if (cgrp->root == &cgrp_dfl_root)
- cgroup_bpf_offline(cgrp);
+ ret = blocking_notifier_call_chain(&cgroup_lifetime_notifier,
+ CGROUP_LIFETIME_OFFLINE, cgrp);
+ WARN_ON_ONCE(notifier_to_errno(ret));
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -6136,6 +6144,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
} else {
css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
BUG_ON(css->id < 0);
+
+ BUG_ON(ss_rstat_init(ss));
+ BUG_ON(css_rstat_init(css));
}
/* Update the init_css_set to contain a subsys
@@ -6184,6 +6195,8 @@ int __init cgroup_init_early(void)
ss->id, ss->name);
WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
+ WARN(ss->early_init && ss->css_rstat_flush,
+ "cgroup rstat cannot be used with early init subsystem\n");
ss->id = i;
ss->name = cgroup_subsys_name[i];
@@ -6212,7 +6225,7 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
- cgroup_rstat_boot();
+ BUG_ON(ss_rstat_init(NULL));
get_user_ns(init_cgroup_ns.user_ns);
@@ -6225,6 +6238,8 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
+ cgroup_bpf_lifetime_notifier_init();
+
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
cgroup_unlock();
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 24b70ea3e6ce..3bc4301466f3 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -192,6 +192,20 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
WRITE_ONCE(cs->prs_err, PERR_NONE);
}
+/*
+ * The top_cpuset is always synchronized to cpu_active_mask and we should avoid
+ * using cpu_online_mask as much as possible. An active CPU is always an online
+ * CPU, but not vice versa. cpu_active_mask and cpu_online_mask can differ
+ * during hotplug operations. A CPU is marked active at the last stage of CPU
+ * bringup (CPUHP_AP_ACTIVE). It is also the stage where cpuset hotplug code
+ * will be called to update the sched domains so that the scheduler can move
+ * a normal task to a newly active CPU or remove tasks away from a newly
+ * inactivated CPU. The online bit is set much earlier in the CPU bringup
+ * process and cleared much later in CPU teardown.
+ *
+ * If cpu_online_mask is used while a hotunplug operation is happening in
+ * parallel, we may leave an offline CPU in cpus_allowed or some other masks.
+ */
static struct cpuset top_cpuset = {
.flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
@@ -355,18 +369,18 @@ static inline bool partition_is_populated(struct cpuset *cs,
* appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_mask.
+ * of cpu_active_mask.
*
* Call with callback_lock or cpuset_mutex held.
*/
-static void guarantee_online_cpus(struct task_struct *tsk,
+static void guarantee_active_cpus(struct task_struct *tsk,
struct cpumask *pmask)
{
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
struct cpuset *cs;
- if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
- cpumask_copy(pmask, cpu_online_mask);
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
+ cpumask_copy(pmask, cpu_active_mask);
rcu_read_lock();
cs = task_cs(tsk);
@@ -1390,14 +1404,12 @@ static int compute_effective_exclusive_cpumask(struct cpuset *cs,
if (sibling == cs)
continue;
- if (!cpumask_empty(sibling->exclusive_cpus) &&
- cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
+ if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
retval++;
continue;
}
- if (!cpumask_empty(sibling->effective_xcpus) &&
- cpumask_intersects(xcpus, sibling->effective_xcpus)) {
+ if (cpumask_intersects(xcpus, sibling->effective_xcpus)) {
cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
retval++;
}
@@ -1441,13 +1453,15 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* The requested exclusive_cpus must not be allocated to other
* partitions and it can't use up all the root's effective_cpus.
*
- * Note that if there is any local partition root above it or
- * remote partition root underneath it, its exclusive_cpus must
- * have overlapped with subpartitions_cpus.
+	 * The effective_xcpus mask can contain offline CPUs, but there must
+	 * be at least one online CPU present before it can be enabled.
+ *
+ * Note that creating a remote partition with any local partition root
+ * above it or remote partition root underneath it is not allowed.
*/
compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
- if (cpumask_empty(tmp->new_cpus) ||
- cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
+ WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
+ if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
return PERR_INVCPUS;
@@ -1543,6 +1557,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
* left in the top cpuset.
*/
if (adding) {
+ WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus));
if (!capable(CAP_SYS_ADMIN))
cs->prs_err = PERR_ACCESS;
else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
@@ -1652,7 +1667,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
bool nocpu;
lockdep_assert_held(&cpuset_mutex);
- WARN_ON_ONCE(is_remote_partition(cs));
+ WARN_ON_ONCE(is_remote_partition(cs)); /* For local partition only */
/*
* new_prs will only be changed for the partcmd_update and
@@ -1698,7 +1713,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
* exclusive_cpus not set. Sibling conflict should only happen
* if exclusive_cpus isn't set.
*/
- xcpus = tmp->new_cpus;
+ xcpus = tmp->delmask;
if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
@@ -1719,9 +1734,20 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (nocpu)
return PERR_NOCPUS;
- deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus);
- if (deleting)
- subparts_delta++;
+ /*
+ * This function will only be called when all the preliminary
+ * checks have passed. At this point, the following condition
+ * should hold.
+ *
+ * (cs->effective_xcpus & cpu_active_mask) ⊆ parent->effective_cpus
+ *
+ * Warn if it is not the case.
+ */
+ cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+
+ deleting = true;
+ subparts_delta++;
new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
@@ -1776,6 +1802,15 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
parent->effective_xcpus);
}
/*
+ * The new CPUs to be removed from parent's effective CPUs
+ * must be present.
+ */
+ if (deleting) {
+ cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask);
+ WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
+ }
+
+ /*
* Make partition invalid if parent's effective_cpus could
* become empty and there are tasks in the parent.
*/
@@ -2265,7 +2300,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
bool force = false;
int old_prs = cs->partition_root_state;
- /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
@@ -3084,7 +3119,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
lockdep_assert_held(&cpuset_mutex);
if (cs != &top_cpuset)
- guarantee_online_cpus(task, cpus_attach);
+ guarantee_active_cpus(task, cpus_attach);
else
cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
subpartitions_cpus);
@@ -3526,11 +3561,7 @@ out_unlock:
* will call rebuild_sched_domains_locked(). That is not needed
* in the default hierarchy where only changes in partition
* will cause repartitioning.
- *
- * If the cpuset has the 'sched.partition' flag enabled, simulate
- * turning 'sched.partition" off.
*/
-
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
@@ -3548,6 +3579,11 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_unlock();
}
+/*
+ * If a dying cpuset has the 'cpus.partition' enabled, turn it off by
+ * changing it back to member to free its exclusive CPUs back to the pool to
+ * be used by other online cpusets.
+ */
static void cpuset_css_killed(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
@@ -4028,7 +4064,7 @@ void __init cpuset_init_smp(void)
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_mask, even if this means going outside the
+ * subset of cpu_active_mask, even if this means going outside the
* tasks cpuset, except when the task is in the top cpuset.
**/
@@ -4042,7 +4078,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
cs = task_cs(tsk);
if (cs != &top_cpuset)
- guarantee_online_cpus(tsk, pmask);
+ guarantee_active_cpus(tsk, pmask);
/*
* Tasks in the top cpuset won't get update to their cpumasks
* when a hotplug online/offline event happens. So we include all
@@ -4056,7 +4092,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* allowable online cpu left, we fall back to all possible cpus.
*/
cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
- if (!cpumask_intersects(pmask, cpu_online_mask))
+ if (!cpumask_intersects(pmask, cpu_active_mask))
cpumask_copy(pmask, possible_mask);
}
@@ -4166,7 +4202,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
}
/*
- * cpuset_node_allowed - Can we allocate on a memory node?
+ * cpuset_current_node_allowed - Can current task allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -4205,7 +4241,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-bool cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
bool allowed; /* is allocation in zone z allowed? */
@@ -4239,6 +4275,42 @@ bool cpuset_node_allowed(int node, gfp_t gfp_mask)
return allowed;
}
+bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+ bool allowed;
+
+ /*
+	 * In v1, mem_cgroup and cpuset are unlikely to be in the same hierarchy
+ * and mems_allowed is likely to be empty even if we could get to it,
+ * so return true to avoid taking a global lock on the empty check.
+ */
+ if (!cpuset_v2())
+ return true;
+
+ css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+ if (!css)
+ return true;
+
+ /*
+ * Normally, accessing effective_mems would require the cpuset_mutex
+ * or callback_lock - but node_isset is atomic and the reference
+ * taken via cgroup_get_e_css is sufficient to protect css.
+ *
+ * Since this interface is intended for use by migration paths, we
+	 * relax locking here to avoid taking global locks - while accepting
+	 * that the result may be inaccurate in rare scenarios.
+ *
+ * Reclaim and migration are subject to these same race conditions, and
+ * cannot make strong isolation guarantees, so this is acceptable.
+ */
+ cs = container_of(css, struct cpuset, css);
+ allowed = node_isset(nid, cs->effective_mems);
+ css_put(css);
+ return allowed;
+}
+
/**
* cpuset_spread_node() - On which node to begin search for a page
* @rotor: round robin rotor
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 039d1eb2f215..dd9417425d92 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -66,15 +66,9 @@ static struct freezer *parent_freezer(struct freezer *freezer)
bool cgroup_freezing(struct task_struct *task)
{
bool ret;
- unsigned int state;
rcu_read_lock();
- /* Check if the cgroup is still FREEZING, but not FROZEN. The extra
- * !FROZEN check is required, because the FREEZING bit is not cleared
- * when the state FROZEN is reached.
- */
- state = task_freezer(task)->state;
- ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN);
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
rcu_read_unlock();
return ret;
@@ -188,13 +182,12 @@ static void freezer_attach(struct cgroup_taskset *tset)
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
- freeze_task(task);
-
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
freezer = parent_freezer(freezer);
}
+ freeze_task(task);
}
}
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 2fa3a4fb2aaf..6a01d91ea4cb 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -24,6 +24,10 @@ static const char *const misc_res_name[] = {
/* AMD SEV-ES ASIDs resource */
"sev_es",
#endif
+#ifdef CONFIG_INTEL_TDX_HOST
+ /* Intel TDX HKIDs resource */
+ "tdx",
+#endif
};
/* Root misc cgroup */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b2239156b7de..cbeaa499a96a 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -9,18 +9,64 @@
#include <trace/events/cgroup.h>
-static DEFINE_SPINLOCK(cgroup_rstat_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
+static DEFINE_SPINLOCK(rstat_base_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
-static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
+/*
+ * Determines whether a given css can participate in rstat.
+ * css's that are cgroup::self use rstat for base stats.
+ * Other css's associated with a subsystem use rstat only when
+ * they define the ss->css_rstat_flush callback.
+ */
+static inline bool css_uses_rstat(struct cgroup_subsys_state *css)
+{
+ return css_is_self(css) || css->ss->css_rstat_flush != NULL;
+}
+
+static struct css_rstat_cpu *css_rstat_cpu(
+ struct cgroup_subsys_state *css, int cpu)
+{
+ return per_cpu_ptr(css->rstat_cpu, cpu);
+}
+
+static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu(
+ struct cgroup *cgrp, int cpu)
{
- return per_cpu_ptr(cgrp->rstat_cpu, cpu);
+ return per_cpu_ptr(cgrp->rstat_base_cpu, cpu);
+}
+
+static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
+{
+ if (ss)
+ return &ss->rstat_ss_lock;
+
+ return &rstat_base_lock;
+}
+
+static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
+{
+ if (ss) {
+ /*
+ * Depending on config, the subsystem per-cpu lock type may be an
+		 * empty struct. In environments where this is the case, allocation
+ * of this field is not performed in ss_rstat_init(). Avoid a
+ * cpu-based offset relative to NULL by returning early. When the
+ * lock type is zero in size, the corresponding lock functions are
+ * no-ops so passing them NULL is acceptable.
+ */
+ if (sizeof(*ss->rstat_ss_cpu_lock) == 0)
+ return NULL;
+
+ return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
+ }
+
+ return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
}
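The early return above leans on the fact that, in some configurations, the per-CPU lock type compiles down to an empty struct whose lock/unlock helpers are no-ops, so handing them a NULL pointer is harmless. A small GNU C sketch of that size-based dispatch, with invented type names (sizeof of an empty struct is 0 only as a GNU extension):

#include <stdio.h>

struct empty_lock { };			/* GNU C extension: sizeof == 0 */
struct real_lock { int owner; };

/* sizeof does not evaluate its operand, so a NULL pointer is fine here */
#define LOCK_IS_NOOP(p)	(sizeof(*(p)) == 0)

int main(void)
{
	struct empty_lock *el = NULL;	/* never allocated, never dereferenced */
	struct real_lock rl = { 0 }, *rp = &rl;

	printf("empty lock is a no-op: %d\n", (int)LOCK_IS_NOOP(el));	/* 1 */
	printf("real lock is a no-op:  %d\n", (int)LOCK_IS_NOOP(rp));	/* 0 */
	return 0;
}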
/*
- * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ * Helper functions for rstat per CPU locks.
*
* This makes it easier to diagnose locking issues and contention in
 * production environments. The parameter @fast_path determines the
@@ -28,20 +74,23 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
* operations without handling high-frequency fast-path "update" events.
*/
static __always_inline
-unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
- struct cgroup *cgrp, const bool fast_path)
+unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
+ const bool fast_path)
{
+ struct cgroup *cgrp = css->cgroup;
+ raw_spinlock_t *cpu_lock;
unsigned long flags;
bool contended;
/*
- * The _irqsave() is needed because cgroup_rstat_lock is
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
- * this lock with the _irq() suffix only disables interrupts on
- * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
- * interrupts on both configurations. The _irqsave() ensures
- * that interrupts are always disabled and later restored.
+ * The _irqsave() is needed because the locks used for flushing are
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
+ * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
+ * kernel. The raw_spinlock_t below disables interrupts on both
+ * configurations. The _irqsave() ensures that interrupts are always
+ * disabled and later restored.
*/
+ cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
if (contended) {
if (fast_path)
@@ -61,50 +110,59 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
}
static __always_inline
-void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
- struct cgroup *cgrp, unsigned long flags,
- const bool fast_path)
+void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
+ unsigned long flags, const bool fast_path)
{
+ struct cgroup *cgrp = css->cgroup;
+ raw_spinlock_t *cpu_lock;
+
if (fast_path)
trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
else
trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+ cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
/**
- * cgroup_rstat_updated - keep track of updated rstat_cpu
- * @cgrp: target cgroup
+ * css_rstat_updated - keep track of updated rstat_cpu
+ * @css: target cgroup subsystem state
* @cpu: cpu on which rstat_cpu was updated
*
- * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * cgroup_rstat_cpu definition for details.
+ * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
+ * rstat_cpu->updated_children list. See the comment on top of
+ * css_rstat_cpu definition for details.
*/
-__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
+__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
unsigned long flags;
/*
+ * Since bpf programs can call this function, prevent access to
+ * uninitialized rstat pointers.
+ */
+ if (!css_uses_rstat(css))
+ return;
+
+ /*
* Speculative already-on-list test. This may race leading to
* temporary inaccuracies, which is fine.
*
* Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @cgrp is on the list by
+ * instead of NULL, we can tell whether @css is on the list by
* testing the next pointer for NULL.
*/
- if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
+ if (data_race(css_rstat_cpu(css, cpu)->updated_next))
return;
- flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
+ flags = _css_rstat_cpu_lock(css, cpu, true);
- /* put @cgrp and all ancestors on the corresponding updated lists */
+ /* put @css and all ancestors on the corresponding updated lists */
while (true) {
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
- struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_rstat_cpu *prstatc;
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
+ struct cgroup_subsys_state *parent = css->parent;
+ struct css_rstat_cpu *prstatc;
/*
* Both additions and removals are bottom-up. If a cgroup
@@ -115,53 +173,78 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
/* Root has no parent to link it to, but mark it busy */
if (!parent) {
- rstatc->updated_next = cgrp;
+ rstatc->updated_next = css;
break;
}
- prstatc = cgroup_rstat_cpu(parent, cpu);
+ prstatc = css_rstat_cpu(parent, cpu);
rstatc->updated_next = prstatc->updated_children;
- prstatc->updated_children = cgrp;
+ prstatc->updated_children = css;
- cgrp = parent;
+ css = parent;
}
- _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
+ _css_rstat_cpu_unlock(css, cpu, flags, true);
}
/**
- * cgroup_rstat_push_children - push children cgroups into the given list
+ * css_rstat_push_children - push children css's into the given list
* @head: current head of the list (= subtree root)
* @child: first child of the root
* @cpu: target cpu
- * Return: A new singly linked list of cgroups to be flush
+ * Return: A new singly linked list of css's to be flushed
*
- * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
+ * Iteratively traverse down the css_rstat_cpu updated tree level by
* level and push all the parents first before their next level children
- * into a singly linked list built from the tail backward like "pushing"
- * cgroups into a stack. The root is pushed by the caller.
+ * into a singly linked list via the rstat_flush_next pointer built from the
+ * tail backward like "pushing" css's into a stack. The root is pushed by
+ * the caller.
*/
-static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
- struct cgroup *child, int cpu)
+static struct cgroup_subsys_state *css_rstat_push_children(
+ struct cgroup_subsys_state *head,
+ struct cgroup_subsys_state *child, int cpu)
{
- struct cgroup *chead = child; /* Head of child cgroup level */
- struct cgroup *ghead = NULL; /* Head of grandchild cgroup level */
- struct cgroup *parent, *grandchild;
- struct cgroup_rstat_cpu *crstatc;
+ struct cgroup_subsys_state *cnext = child; /* Next head of child css level */
+ struct cgroup_subsys_state *ghead = NULL; /* Head of grandchild css level */
+ struct cgroup_subsys_state *parent, *grandchild;
+ struct css_rstat_cpu *crstatc;
child->rstat_flush_next = NULL;
+ /*
+ * The subsystem rstat lock must be held for the whole duration from
+ * here, while the rstat_flush_next list is being constructed, until
+ * it is consumed later in css_rstat_flush().
+ */
+ lockdep_assert_held(ss_rstat_lock(head->ss));
+
+ /*
+ * Notation: -> updated_next pointer
+ * => rstat_flush_next pointer
+ *
+ * Assuming the following sample updated_children lists:
+ * P: C1 -> C2 -> P
+ * C1: G11 -> G12 -> C1
+ * C2: G21 -> G22 -> C2
+ *
+ * After 1st iteration:
+ * head => C2 => C1 => NULL
+ * ghead => G21 => G11 => NULL
+ *
+ * After 2nd iteration:
+ * head => G12 => G11 => G22 => G21 => C2 => C1 => NULL
+ */
next_level:
- while (chead) {
- child = chead;
- chead = child->rstat_flush_next;
- parent = cgroup_parent(child);
+ while (cnext) {
+ child = cnext;
+ cnext = child->rstat_flush_next;
+ parent = child->parent;
- /* updated_next is parent cgroup terminated */
+ /* updated_next is parent cgroup terminated if !NULL */
while (child != parent) {
child->rstat_flush_next = head;
head = child;
- crstatc = cgroup_rstat_cpu(child, cpu);
+ crstatc = css_rstat_cpu(child, cpu);
grandchild = crstatc->updated_children;
if (grandchild != child) {
/* Push the grand child to the next level */
@@ -175,7 +258,7 @@ next_level:
}
if (ghead) {
- chead = ghead;
+ cnext = ghead;
ghead = NULL;
goto next_level;
}
@@ -183,31 +266,31 @@ next_level:
}
/**
- * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
- * @root: root of the cgroup subtree to traverse
+ * css_rstat_updated_list - build a list of updated css's to be flushed
+ * @root: root of the css subtree to traverse
* @cpu: target cpu
- * Return: A singly linked list of cgroups to be flushed
+ * Return: A singly linked list of css's to be flushed
*
* Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
- * each returned cgroup is unlinked from the updated tree.
+ * each returned css is unlinked from the updated tree.
*
* The only ordering guarantee is that, for a parent and a child pair
* covered by a given traversal, the child is before its parent in
* the list.
*
* Note that updated_children is self terminated and points to a list of
- * child cgroups if not empty. Whereas updated_next is like a sibling link
- * within the children list and terminated by the parent cgroup. An exception
- * here is the cgroup root whose updated_next can be self terminated.
+ * child css's if not empty. Whereas updated_next is like a sibling link
+ * within the children list and terminated by the parent css. An exception
+ * here is the css root whose updated_next can be self terminated.
*/
-static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
+static struct cgroup_subsys_state *css_rstat_updated_list(
+ struct cgroup_subsys_state *root, int cpu)
{
- raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
- struct cgroup *head = NULL, *parent, *child;
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
+ struct cgroup_subsys_state *head = NULL, *parent, *child;
unsigned long flags;
- flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
+ flags = _css_rstat_cpu_lock(root, cpu, false);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
@@ -217,17 +300,17 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
* Unlink @root from its parent. As the updated_children list is
* singly linked, we have to walk it to find the removal point.
*/
- parent = cgroup_parent(root);
+ parent = root->parent;
if (parent) {
- struct cgroup_rstat_cpu *prstatc;
- struct cgroup **nextp;
+ struct css_rstat_cpu *prstatc;
+ struct cgroup_subsys_state **nextp;
- prstatc = cgroup_rstat_cpu(parent, cpu);
+ prstatc = css_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
while (*nextp != root) {
- struct cgroup_rstat_cpu *nrstatc;
+ struct css_rstat_cpu *nrstatc;
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ nrstatc = css_rstat_cpu(*nextp, cpu);
WARN_ON_ONCE(*nextp == parent);
nextp = &nrstatc->updated_next;
}
@@ -242,16 +325,16 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
child = rstatc->updated_children;
rstatc->updated_children = root;
if (child != root)
- head = cgroup_rstat_push_children(head, child, cpu);
+ head = css_rstat_push_children(head, child, cpu);
unlock_ret:
- _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
+ _css_rstat_cpu_unlock(root, cpu, flags, false);
return head;
}
/*
* A hook for bpf stat collectors to attach to and flush their stats.
- * Together with providing bpf kfuncs for cgroup_rstat_updated() and
- * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * Together with providing bpf kfuncs for css_rstat_updated() and
+ * css_rstat_flush(), this enables a complete workflow where bpf progs that
* collect cgroup stats can integrate with rstat for efficient flushing.
*
* A static noinline declaration here could cause the compiler to optimize away
@@ -271,7 +354,7 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
__bpf_hook_end();
/*
- * Helper functions for locking cgroup_rstat_lock.
+ * Helper functions for locking.
*
* This makes it easier to diagnose locking issues and contention in
* production environments. The parameter @cpu_in_loop indicates lock
@@ -279,115 +362,181 @@ __bpf_hook_end();
* value -1 is used when obtaining the main lock else this is the CPU
* number processed last.
*/
-static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
- __acquires(&cgroup_rstat_lock)
+static inline void __css_rstat_lock(struct cgroup_subsys_state *css,
+ int cpu_in_loop)
+ __acquires(ss_rstat_lock(css->ss))
{
+ struct cgroup *cgrp = css->cgroup;
+ spinlock_t *lock;
bool contended;
- contended = !spin_trylock_irq(&cgroup_rstat_lock);
+ lock = ss_rstat_lock(css->ss);
+ contended = !spin_trylock_irq(lock);
if (contended) {
trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
- spin_lock_irq(&cgroup_rstat_lock);
+ spin_lock_irq(lock);
}
trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}
-static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
- __releases(&cgroup_rstat_lock)
+static inline void __css_rstat_unlock(struct cgroup_subsys_state *css,
+ int cpu_in_loop)
+ __releases(ss_rstat_lock(css->ss))
{
+ struct cgroup *cgrp = css->cgroup;
+ spinlock_t *lock;
+
+ lock = ss_rstat_lock(css->ss);
trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
- spin_unlock_irq(&cgroup_rstat_lock);
+ spin_unlock_irq(lock);
}
/**
- * cgroup_rstat_flush - flush stats in @cgrp's subtree
- * @cgrp: target cgroup
+ * css_rstat_flush - flush stats in @css's rstat subtree
+ * @css: target cgroup subsystem state
*
- * Collect all per-cpu stats in @cgrp's subtree into the global counters
- * and propagate them upwards. After this function returns, all cgroups in
- * the subtree have up-to-date ->stat.
+ * Collect all per-cpu stats in @css's subtree into the global counters
+ * and propagate them upwards. After this function returns, all rstat
+ * nodes in the subtree have up-to-date ->stat.
*
- * This also gets all cgroups in the subtree including @cgrp off the
+ * This also gets all rstat nodes in the subtree including @css off the
* ->updated_children lists.
*
* This function may block.
*/
-__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
+__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css)
{
int cpu;
+ bool is_self = css_is_self(css);
+
+ /*
+ * Since bpf programs can call this function, prevent access to
+ * uninitialized rstat pointers.
+ */
+ if (!css_uses_rstat(css))
+ return;
might_sleep();
for_each_possible_cpu(cpu) {
- struct cgroup *pos;
+ struct cgroup_subsys_state *pos;
/* Reacquire for each CPU to avoid disabling IRQs too long */
- __cgroup_rstat_lock(cgrp, cpu);
- pos = cgroup_rstat_updated_list(cgrp, cpu);
+ __css_rstat_lock(css, cpu);
+ pos = css_rstat_updated_list(css, cpu);
for (; pos; pos = pos->rstat_flush_next) {
- struct cgroup_subsys_state *css;
-
- cgroup_base_stat_flush(pos, cpu);
- bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
-
- rcu_read_lock();
- list_for_each_entry_rcu(css, &pos->rstat_css_list,
- rstat_css_node)
- css->ss->css_rstat_flush(css, cpu);
- rcu_read_unlock();
+ if (is_self) {
+ cgroup_base_stat_flush(pos->cgroup, cpu);
+ bpf_rstat_flush(pos->cgroup,
+ cgroup_parent(pos->cgroup), cpu);
+ } else
+ pos->ss->css_rstat_flush(pos, cpu);
}
- __cgroup_rstat_unlock(cgrp, cpu);
+ __css_rstat_unlock(css, cpu);
if (!cond_resched())
cpu_relax();
}
}
-int cgroup_rstat_init(struct cgroup *cgrp)
+int css_rstat_init(struct cgroup_subsys_state *css)
{
+ struct cgroup *cgrp = css->cgroup;
int cpu;
+ bool is_self = css_is_self(css);
+
+ if (is_self) {
+ /* the root cgrp has rstat_base_cpu preallocated */
+ if (!cgrp->rstat_base_cpu) {
+ cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu);
+ if (!cgrp->rstat_base_cpu)
+ return -ENOMEM;
+ }
+ } else if (css->ss->css_rstat_flush == NULL)
+ return 0;
+
+ /* the root cgrp's self css has rstat_cpu preallocated */
+ if (!css->rstat_cpu) {
+ css->rstat_cpu = alloc_percpu(struct css_rstat_cpu);
+ if (!css->rstat_cpu) {
+ if (is_self)
+ free_percpu(cgrp->rstat_base_cpu);
- /* the root cgrp has rstat_cpu preallocated */
- if (!cgrp->rstat_cpu) {
- cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
- if (!cgrp->rstat_cpu)
return -ENOMEM;
+ }
}
/* ->updated_children list is self terminated */
for_each_possible_cpu(cpu) {
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- rstatc->updated_children = cgrp;
- u64_stats_init(&rstatc->bsync);
+ rstatc->updated_children = css;
+
+ if (is_self) {
+ struct cgroup_rstat_base_cpu *rstatbc;
+
+ rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
+ u64_stats_init(&rstatbc->bsync);
+ }
}
return 0;
}
-void cgroup_rstat_exit(struct cgroup *cgrp)
+void css_rstat_exit(struct cgroup_subsys_state *css)
{
int cpu;
- cgroup_rstat_flush(cgrp);
+ if (!css_uses_rstat(css))
+ return;
+
+ css_rstat_flush(css);
/* sanity check */
for_each_possible_cpu(cpu) {
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
+ if (WARN_ON_ONCE(rstatc->updated_children != css) ||
WARN_ON_ONCE(rstatc->updated_next))
return;
}
- free_percpu(cgrp->rstat_cpu);
- cgrp->rstat_cpu = NULL;
+ if (css_is_self(css)) {
+ struct cgroup *cgrp = css->cgroup;
+
+ free_percpu(cgrp->rstat_base_cpu);
+ cgrp->rstat_base_cpu = NULL;
+ }
+
+ free_percpu(css->rstat_cpu);
+ css->rstat_cpu = NULL;
}
-void __init cgroup_rstat_boot(void)
+/**
+ * ss_rstat_init - subsystem-specific rstat initialization
+ * @ss: target subsystem
+ *
+ * If @ss is NULL, the static locks associated with the base stats
+ * are initialized. If @ss is non-NULL, the subsystem-specific locks
+ * are initialized.
+ */
+int __init ss_rstat_init(struct cgroup_subsys *ss)
{
int cpu;
+ /*
+ * Depending on config, the subsystem per-cpu lock type may be an empty
+ * struct. Avoid allocating a size of zero in this case.
+ */
+ if (ss && sizeof(*ss->rstat_ss_cpu_lock)) {
+ ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
+ if (!ss->rstat_ss_cpu_lock)
+ return -ENOMEM;
+ }
+
+ spin_lock_init(ss_rstat_lock(ss));
for_each_possible_cpu(cpu)
- raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
+ raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+
+ return 0;
}
/*
@@ -420,9 +569,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
- struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
+ struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_rstat_cpu *prstatc;
+ struct cgroup_rstat_base_cpu *prstatbc;
struct cgroup_base_stat delta;
unsigned seq;
@@ -432,15 +581,15 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
/* fetch the current per-cpu values */
do {
- seq = __u64_stats_fetch_begin(&rstatc->bsync);
- delta = rstatc->bstat;
- } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
+ seq = __u64_stats_fetch_begin(&rstatbc->bsync);
+ delta = rstatbc->bstat;
+ } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq));
/* propagate per-cpu delta to cgroup and per-cpu global statistics */
- cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
+ cgroup_base_stat_sub(&delta, &rstatbc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
- cgroup_base_stat_add(&rstatc->last_bstat, &delta);
- cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->last_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta);
/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
if (cgroup_parent(parent)) {
@@ -449,73 +598,73 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
cgroup_base_stat_add(&parent->bstat, &delta);
cgroup_base_stat_add(&cgrp->last_bstat, &delta);
- delta = rstatc->subtree_bstat;
- prstatc = cgroup_rstat_cpu(parent, cpu);
- cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
- cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
- cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
+ delta = rstatbc->subtree_bstat;
+ prstatbc = cgroup_rstat_base_cpu(parent, cpu);
+ cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat);
+ cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta);
+ cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta);
}
}
-static struct cgroup_rstat_cpu *
+static struct cgroup_rstat_base_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
- struct cgroup_rstat_cpu *rstatc;
+ struct cgroup_rstat_base_cpu *rstatbc;
- rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
- return rstatc;
+ rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu);
+ *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync);
+ return rstatbc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc,
+ struct cgroup_rstat_base_cpu *rstatbc,
unsigned long flags)
{
- u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
- cgroup_rstat_updated(cgrp, smp_processor_id());
- put_cpu_ptr(rstatc);
+ u64_stats_update_end_irqrestore(&rstatbc->bsync, flags);
+ css_rstat_updated(&cgrp->self, smp_processor_id());
+ put_cpu_ptr(rstatbc);
}
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
- struct cgroup_rstat_cpu *rstatc;
+ struct cgroup_rstat_base_cpu *rstatbc;
unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
- rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+ rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+ rstatbc->bstat.cputime.sum_exec_runtime += delta_exec;
+ cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
- struct cgroup_rstat_cpu *rstatc;
+ struct cgroup_rstat_base_cpu *rstatbc;
unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
+ rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_NICE:
- rstatc->bstat.ntime += delta_exec;
+ rstatbc->bstat.ntime += delta_exec;
fallthrough;
case CPUTIME_USER:
- rstatc->bstat.cputime.utime += delta_exec;
+ rstatbc->bstat.cputime.utime += delta_exec;
break;
case CPUTIME_SYSTEM:
case CPUTIME_IRQ:
case CPUTIME_SOFTIRQ:
- rstatc->bstat.cputime.stime += delta_exec;
+ rstatbc->bstat.cputime.stime += delta_exec;
break;
#ifdef CONFIG_SCHED_CORE
case CPUTIME_FORCEIDLE:
- rstatc->bstat.forceidle_sum += delta_exec;
+ rstatbc->bstat.forceidle_sum += delta_exec;
break;
#endif
default:
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags);
}
/*
@@ -574,12 +723,12 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
struct cgroup_base_stat bstat;
if (cgroup_parent(cgrp)) {
- cgroup_rstat_flush(cgrp);
- __cgroup_rstat_lock(cgrp, -1);
+ css_rstat_flush(&cgrp->self);
+ __css_rstat_lock(&cgrp->self, -1);
bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&bstat.cputime.utime, &bstat.cputime.stime);
- __cgroup_rstat_unlock(cgrp, -1);
+ __css_rstat_unlock(&cgrp->self, -1);
} else {
root_cgroup_cputime(&bstat);
}
@@ -601,10 +750,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
cgroup_force_idle_show(seq, &bstat);
}
-/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
-BTF_ID_FLAGS(func, cgroup_rstat_updated)
-BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, css_rstat_updated)
+BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
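A hedged sketch (editor's illustration, not part of this patch) of how a controller might sit on top of the css-based rstat API above. Only css_rstat_updated(), css_rstat_flush() and the ->css_rstat_flush() callback come from the code above; the my_* names and the global per-cpu counter are hypothetical simplifications. A real controller would embed its per-cpu state in its css and rely on the subsystem rstat lock that is held around the flush callback.

struct my_cpu_stat {
	u64 events;		/* per-cpu count since boot */
	u64 last_events;	/* portion already folded into the global sum */
};

static DEFINE_PER_CPU(struct my_cpu_stat, my_cpu_stat);
static u64 my_total_events;	/* updated under the subsystem rstat lock */

/* hot path: bump the per-cpu counter and mark this css updated on this CPU */
static void my_ss_account_event(struct cgroup_subsys_state *css)
{
	int cpu = get_cpu();	/* pin the CPU while touching per-cpu state */

	this_cpu_inc(my_cpu_stat.events);
	css_rstat_updated(css, cpu);
	put_cpu();
}

/* ->css_rstat_flush() callback: fold the per-cpu delta into the global sum */
static void my_ss_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
	struct my_cpu_stat *stat = per_cpu_ptr(&my_cpu_stat, cpu);
	u64 delta = stat->events - stat->last_events;

	stat->last_events = stat->events;
	my_total_events += delta;
}

/* read side: flush the subtree (may sleep) before reporting the total */
static u64 my_ss_read_total(struct cgroup_subsys_state *css)
{
	css_rstat_flush(css);
	return my_total_events;
}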
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index 8aafd050b754..e81327d2cd63 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -112,3 +112,8 @@ CONFIG_BRANCH_PROFILE_NONE=y
CONFIG_DYNAMIC_FTRACE=y
CONFIG_FTRACE=y
CONFIG_FUNCTION_TRACER=y
+#
+# Preemption
+#
+CONFIG_DEBUG_PREEMPT=y
+CONFIG_PREEMPT=y
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index dd7c32fb5ac1..64caaf997fc0 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -60,9 +60,15 @@ CONFIG_LIST_HARDENED=y
# Initialize all heap variables to zero on allocation.
CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
+# Initialize all heap variables to zero on free to reduce stale data lifetime.
+CONFIG_INIT_ON_FREE_DEFAULT_ON=y
+
# Initialize all stack variables to zero on function entry.
CONFIG_INIT_STACK_ALL_ZERO=y
+# Wipe kernel stack after syscall completion to reduce stale data lifetime.
+CONFIG_KSTACK_ERASE=y
+
# Wipe RAM at reboot via EFI. For more details, see:
# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/
# https://bugzilla.redhat.com/show_bug.cgi?id=1532058
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index b753695c5a8f..5dd0f0a34a73 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -2,3 +2,4 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_KERNEL_XZ=y
CONFIG_SLUB=y
CONFIG_SLUB_TINY=y
+CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
index 6878b9a49be8..1875a0a5047a 100644
--- a/kernel/configs/xen.config
+++ b/kernel/configs/xen.config
@@ -13,6 +13,8 @@ CONFIG_SCSI=y
CONFIG_FB=y
CONFIG_INPUT_MISC=y
CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTREMOVE=y
+CONFIG_ZONE_DEVICE=y
CONFIG_TTY=y
# Technically not required but otherwise produces
# pretty useless systems starting from allnoconfig
@@ -47,3 +49,4 @@ CONFIG_XEN_GNTDEV=m
CONFIG_XEN_GRANT_DEV_ALLOC=m
CONFIG_SWIOTLB_XEN=y
CONFIG_XEN_PRIVCMD=m
+CONFIG_XEN_UNPOPULATED_ALLOC=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a59e009e0be4..faf0f23fc5d8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -37,6 +37,7 @@
#include <linux/cpuset.h>
#include <linux/random.h>
#include <linux/cc_platform.h>
+#include <linux/parser.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -3174,8 +3175,38 @@ void __init boot_cpu_hotplug_init(void)
#ifdef CONFIG_CPU_MITIGATIONS
/*
- * These are used for a global "mitigations=" cmdline option for toggling
- * optional CPU mitigations.
+ * All except the cross-thread attack vector are mitigated by default.
+ * Cross-thread mitigation often requires disabling SMT, which is expensive,
+ * so cross-thread mitigations are only partially enabled by default.
+ *
+ * Guest-to-Host and Guest-to-Guest vectors are only needed if KVM support is
+ * present.
+ */
+static bool attack_vectors[NR_CPU_ATTACK_VECTORS] __ro_after_init = {
+ [CPU_MITIGATE_USER_KERNEL] = true,
+ [CPU_MITIGATE_USER_USER] = true,
+ [CPU_MITIGATE_GUEST_HOST] = IS_ENABLED(CONFIG_KVM),
+ [CPU_MITIGATE_GUEST_GUEST] = IS_ENABLED(CONFIG_KVM),
+};
+
+bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v)
+{
+ if (v < NR_CPU_ATTACK_VECTORS)
+ return attack_vectors[v];
+
+ WARN_ONCE(1, "Invalid attack vector %d\n", v);
+ return false;
+}
+
+/*
+ * There are 3 global options: 'off', 'auto' and 'auto,nosmt'. These may
+ * optionally be combined with attack-vector disables, which follow them.
+ *
+ * Examples:
+ * mitigations=auto,no_user_kernel,no_user_user,no_cross_thread
+ * mitigations=auto,nosmt,no_guest_host,no_guest_guest
+ *
+ * mitigations=off is equivalent to disabling all attack vectors.
*/
enum cpu_mitigations {
CPU_MITIGATIONS_OFF,
@@ -3183,19 +3214,96 @@ enum cpu_mitigations {
CPU_MITIGATIONS_AUTO_NOSMT,
};
+enum {
+ NO_USER_KERNEL,
+ NO_USER_USER,
+ NO_GUEST_HOST,
+ NO_GUEST_GUEST,
+ NO_CROSS_THREAD,
+ NR_VECTOR_PARAMS,
+};
+
+enum smt_mitigations smt_mitigations __ro_after_init = SMT_MITIGATIONS_AUTO;
static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+static const match_table_t global_mitigations = {
+ { CPU_MITIGATIONS_AUTO_NOSMT, "auto,nosmt"},
+ { CPU_MITIGATIONS_AUTO, "auto"},
+ { CPU_MITIGATIONS_OFF, "off"},
+};
+
+static const match_table_t vector_mitigations = {
+ { NO_USER_KERNEL, "no_user_kernel"},
+ { NO_USER_USER, "no_user_user"},
+ { NO_GUEST_HOST, "no_guest_host"},
+ { NO_GUEST_GUEST, "no_guest_guest"},
+ { NO_CROSS_THREAD, "no_cross_thread"},
+ { NR_VECTOR_PARAMS, NULL},
+};
+
+static int __init mitigations_parse_global_opt(char *arg)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(global_mitigations); i++) {
+ const char *pattern = global_mitigations[i].pattern;
+
+ if (!strncmp(arg, pattern, strlen(pattern))) {
+ cpu_mitigations = global_mitigations[i].token;
+ return strlen(pattern);
+ }
+ }
+
+ return 0;
+}
+
static int __init mitigations_parse_cmdline(char *arg)
{
- if (!strcmp(arg, "off"))
- cpu_mitigations = CPU_MITIGATIONS_OFF;
- else if (!strcmp(arg, "auto"))
- cpu_mitigations = CPU_MITIGATIONS_AUTO;
- else if (!strcmp(arg, "auto,nosmt"))
- cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
- else
- pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
- arg);
+ char *s, *p;
+ int len;
+
+ len = mitigations_parse_global_opt(arg);
+
+ if (cpu_mitigations_off()) {
+ memset(attack_vectors, 0, sizeof(attack_vectors));
+ smt_mitigations = SMT_MITIGATIONS_OFF;
+ } else if (cpu_mitigations_auto_nosmt()) {
+ smt_mitigations = SMT_MITIGATIONS_ON;
+ }
+
+ p = arg + len;
+
+ if (!*p)
+ return 0;
+
+ /* Attack vector controls may come after the ',' */
+ if (*p++ != ',' || !IS_ENABLED(CONFIG_ARCH_HAS_CPU_ATTACK_VECTORS)) {
+ pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg);
+ return 0;
+ }
+
+ while ((s = strsep(&p, ",")) != NULL) {
+ switch (match_token(s, vector_mitigations, NULL)) {
+ case NO_USER_KERNEL:
+ attack_vectors[CPU_MITIGATE_USER_KERNEL] = false;
+ break;
+ case NO_USER_USER:
+ attack_vectors[CPU_MITIGATE_USER_USER] = false;
+ break;
+ case NO_GUEST_HOST:
+ attack_vectors[CPU_MITIGATE_GUEST_HOST] = false;
+ break;
+ case NO_GUEST_GUEST:
+ attack_vectors[CPU_MITIGATE_GUEST_GUEST] = false;
+ break;
+ case NO_CROSS_THREAD:
+ smt_mitigations = SMT_MITIGATIONS_OFF;
+ break;
+ default:
+ pr_crit("Unsupported mitigations options %s\n", s);
+ return 0;
+ }
+ }
return 0;
}
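A standalone userspace sketch (editor's illustration, not part of this patch) of the cmdline grammar handled above: a global option optionally followed by comma-separated per-vector disables. It mirrors the strsep()-based loop in mitigations_parse_cmdline(); the match_token() table lookup is reduced to printing the token.

#define _DEFAULT_SOURCE		/* for strsep() in glibc */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char arg[] = "auto,nosmt,no_guest_host,no_guest_guest";
	/* longest match first, mirroring global_mitigations above */
	const char *globals[] = { "auto,nosmt", "auto", "off" };
	char *p = arg, *s;
	size_t i, len = 0;

	for (i = 0; i < sizeof(globals) / sizeof(globals[0]); i++) {
		if (!strncmp(arg, globals[i], strlen(globals[i]))) {
			printf("global option: %s\n", globals[i]);
			len = strlen(globals[i]);
			break;
		}
	}

	p = arg + len;
	if (!*p)
		return 0;

	/* attack-vector controls must follow a ',' */
	if (*p++ != ',') {
		printf("unsupported trailing text\n");
		return 1;
	}

	while ((s = strsep(&p, ",")) != NULL)
		printf("disable vector: %s\n", s);

	return 0;
}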
diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c
new file mode 100644
index 000000000000..401423ba477d
--- /dev/null
+++ b/kernel/crash_dump_dm_crypt.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/key.h>
+#include <linux/keyctl.h>
+#include <keys/user-type.h>
+#include <linux/crash_dump.h>
+#include <linux/cc_platform.h>
+#include <linux/configfs.h>
+#include <linux/module.h>
+
+#define KEY_NUM_MAX 128 /* maximum dm crypt keys */
+#define KEY_SIZE_MAX 256 /* maximum dm crypt key size */
+#define KEY_DESC_MAX_LEN 128 /* maximum dm crypt key description size */
+
+static unsigned int key_count;
+
+struct dm_crypt_key {
+ unsigned int key_size;
+ char key_desc[KEY_DESC_MAX_LEN];
+ u8 data[KEY_SIZE_MAX];
+};
+
+static struct keys_header {
+ unsigned int total_keys;
+ struct dm_crypt_key keys[] __counted_by(total_keys);
+} *keys_header;
+
+static size_t get_keys_header_size(size_t total_keys)
+{
+ return struct_size(keys_header, keys, total_keys);
+}
+
+unsigned long long dm_crypt_keys_addr;
+EXPORT_SYMBOL_GPL(dm_crypt_keys_addr);
+
+static int __init setup_dmcryptkeys(char *arg)
+{
+ char *end;
+
+ if (!arg)
+ return -EINVAL;
+ dm_crypt_keys_addr = memparse(arg, &end);
+ if (end > arg)
+ return 0;
+
+ dm_crypt_keys_addr = 0;
+ return -EINVAL;
+}
+
+early_param("dmcryptkeys", setup_dmcryptkeys);
+
+/*
+ * Architectures may override this function to read dm crypt keys
+ */
+ssize_t __weak dm_crypt_keys_read(char *buf, size_t count, u64 *ppos)
+{
+ struct kvec kvec = { .iov_base = buf, .iov_len = count };
+ struct iov_iter iter;
+
+ iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+}
+
+static int add_key_to_keyring(struct dm_crypt_key *dm_key,
+ key_ref_t keyring_ref)
+{
+ key_ref_t key_ref;
+ int r;
+
+ /* create or update the requested key and add it to the target keyring */
+ key_ref = key_create_or_update(keyring_ref, "user", dm_key->key_desc,
+ dm_key->data, dm_key->key_size,
+ KEY_USR_ALL, KEY_ALLOC_IN_QUOTA);
+
+ if (!IS_ERR(key_ref)) {
+ r = key_ref_to_ptr(key_ref)->serial;
+ key_ref_put(key_ref);
+ kexec_dprintk("Success adding key %s", dm_key->key_desc);
+ } else {
+ r = PTR_ERR(key_ref);
+ kexec_dprintk("Error when adding key");
+ }
+
+ key_ref_put(keyring_ref);
+ return r;
+}
+
+static void get_keys_from_kdump_reserved_memory(void)
+{
+ struct keys_header *keys_header_loaded;
+
+ arch_kexec_unprotect_crashkres();
+
+ keys_header_loaded = kmap_local_page(pfn_to_page(
+ kexec_crash_image->dm_crypt_keys_addr >> PAGE_SHIFT));
+
+ memcpy(keys_header, keys_header_loaded, get_keys_header_size(key_count));
+ kunmap_local(keys_header_loaded);
+ arch_kexec_protect_crashkres();
+}
+
+static int restore_dm_crypt_keys_to_thread_keyring(void)
+{
+ struct dm_crypt_key *key;
+ size_t keys_header_size;
+ key_ref_t keyring_ref;
+ u64 addr;
+
+ /* find the target keyring (which must be writable) */
+ keyring_ref =
+ lookup_user_key(KEY_SPEC_USER_KEYRING, 0x01, KEY_NEED_WRITE);
+ if (IS_ERR(keyring_ref)) {
+ kexec_dprintk("Failed to get the user keyring\n");
+ return PTR_ERR(keyring_ref);
+ }
+
+ addr = dm_crypt_keys_addr;
+ dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr);
+ if (key_count < 0 || key_count > KEY_NUM_MAX) {
+ kexec_dprintk("Failed to read the number of dm-crypt keys\n");
+ return -1;
+ }
+
+ kexec_dprintk("There are %u keys\n", key_count);
+ addr = dm_crypt_keys_addr;
+
+ keys_header_size = get_keys_header_size(key_count);
+ keys_header = kzalloc(keys_header_size, GFP_KERNEL);
+ if (!keys_header)
+ return -ENOMEM;
+
+ dm_crypt_keys_read((char *)keys_header, keys_header_size, &addr);
+
+ for (int i = 0; i < keys_header->total_keys; i++) {
+ key = &keys_header->keys[i];
+ kexec_dprintk("Get key (size=%u)\n", key->key_size);
+ add_key_to_keyring(key, keyring_ref);
+ }
+
+ return 0;
+}
+
+static int read_key_from_user_keying(struct dm_crypt_key *dm_key)
+{
+ const struct user_key_payload *ukp;
+ struct key *key;
+
+ kexec_dprintk("Requesting logon key %s", dm_key->key_desc);
+ key = request_key(&key_type_logon, dm_key->key_desc, NULL);
+
+ if (IS_ERR(key)) {
+ pr_warn("No such logon key %s\n", dm_key->key_desc);
+ return PTR_ERR(key);
+ }
+
+ ukp = user_key_payload_locked(key);
+ if (!ukp)
+ return -EKEYREVOKED;
+
+ if (ukp->datalen > KEY_SIZE_MAX) {
+ pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX);
+ return -EINVAL;
+ }
+
+ memcpy(dm_key->data, ukp->data, ukp->datalen);
+ dm_key->key_size = ukp->datalen;
+ kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size,
+ dm_key->key_desc, dm_key->data);
+ return 0;
+}
+
+struct config_key {
+ struct config_item item;
+ const char *description;
+};
+
+static inline struct config_key *to_config_key(struct config_item *item)
+{
+ return container_of(item, struct config_key, item);
+}
+
+static ssize_t config_key_description_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%s\n", to_config_key(item)->description);
+}
+
+static ssize_t config_key_description_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct config_key *config_key = to_config_key(item);
+ size_t len;
+ int ret;
+
+ ret = -EINVAL;
+ len = strcspn(page, "\n");
+
+ if (len > KEY_DESC_MAX_LEN) {
+ pr_err("The key description shouldn't exceed %u characters", KEY_DESC_MAX_LEN);
+ return ret;
+ }
+
+ if (!len)
+ return ret;
+
+ kfree(config_key->description);
+ ret = -ENOMEM;
+ config_key->description = kmemdup_nul(page, len, GFP_KERNEL);
+ if (!config_key->description)
+ return ret;
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_key_, description);
+
+static struct configfs_attribute *config_key_attrs[] = {
+ &config_key_attr_description,
+ NULL,
+};
+
+static void config_key_release(struct config_item *item)
+{
+ kfree(to_config_key(item));
+ key_count--;
+}
+
+static struct configfs_item_operations config_key_item_ops = {
+ .release = config_key_release,
+};
+
+static const struct config_item_type config_key_type = {
+ .ct_item_ops = &config_key_item_ops,
+ .ct_attrs = config_key_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item *config_keys_make_item(struct config_group *group,
+ const char *name)
+{
+ struct config_key *config_key;
+
+ if (key_count > KEY_NUM_MAX) {
+ pr_err("Only %u keys at maximum to be created\n", KEY_NUM_MAX);
+ return ERR_PTR(-EINVAL);
+ }
+
+ config_key = kzalloc(sizeof(struct config_key), GFP_KERNEL);
+ if (!config_key)
+ return ERR_PTR(-ENOMEM);
+
+ config_item_init_type_name(&config_key->item, name, &config_key_type);
+
+ key_count++;
+
+ return &config_key->item;
+}
+
+static ssize_t config_keys_count_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", key_count);
+}
+
+CONFIGFS_ATTR_RO(config_keys_, count);
+
+static bool is_dm_key_reused;
+
+static ssize_t config_keys_reuse_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", is_dm_key_reused);
+}
+
+static ssize_t config_keys_reuse_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ if (!kexec_crash_image || !kexec_crash_image->dm_crypt_keys_addr) {
+ kexec_dprintk(
+ "dm-crypt keys haven't be saved to crash-reserved memory\n");
+ return -EINVAL;
+ }
+
+ if (kstrtobool(page, &is_dm_key_reused))
+ return -EINVAL;
+
+ if (is_dm_key_reused)
+ get_keys_from_kdump_reserved_memory();
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_keys_, reuse);
+
+static struct configfs_attribute *config_keys_attrs[] = {
+ &config_keys_attr_count,
+ &config_keys_attr_reuse,
+ NULL,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations config_keys_group_ops = {
+ .make_item = config_keys_make_item,
+};
+
+static const struct config_item_type config_keys_type = {
+ .ct_group_ops = &config_keys_group_ops,
+ .ct_attrs = config_keys_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static bool restore;
+
+static ssize_t config_keys_restore_show(struct config_item *item, char *page)
+{
+ return sprintf(page, "%d\n", restore);
+}
+
+static ssize_t config_keys_restore_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ if (!restore)
+ restore_dm_crypt_keys_to_thread_keyring();
+
+ if (kstrtobool(page, &restore))
+ return -EINVAL;
+
+ return count;
+}
+
+CONFIGFS_ATTR(config_keys_, restore);
+
+static struct configfs_attribute *kdump_config_keys_attrs[] = {
+ &config_keys_attr_restore,
+ NULL,
+};
+
+static const struct config_item_type kdump_config_keys_type = {
+ .ct_attrs = kdump_config_keys_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem config_keys_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "crash_dm_crypt_keys",
+ .ci_type = &config_keys_type,
+ },
+ },
+};
+
+static int build_keys_header(void)
+{
+ struct config_item *item = NULL;
+ struct config_key *key;
+ int i, r;
+
+ if (keys_header != NULL)
+ kvfree(keys_header);
+
+ keys_header = kzalloc(get_keys_header_size(key_count), GFP_KERNEL);
+ if (!keys_header)
+ return -ENOMEM;
+
+ keys_header->total_keys = key_count;
+
+ i = 0;
+ list_for_each_entry(item, &config_keys_subsys.su_group.cg_children,
+ ci_entry) {
+ if (item->ci_type != &config_key_type)
+ continue;
+
+ key = to_config_key(item);
+
+ if (!key->description) {
+ pr_warn("No key description for key %s\n", item->ci_name);
+ return -EINVAL;
+ }
+
+ strscpy(keys_header->keys[i].key_desc, key->description,
+ KEY_DESC_MAX_LEN);
+ r = read_key_from_user_keying(&keys_header->keys[i]);
+ if (r != 0) {
+ kexec_dprintk("Failed to read key %s\n",
+ keys_header->keys[i].key_desc);
+ return r;
+ }
+ i++;
+ kexec_dprintk("Found key: %s\n", item->ci_name);
+ }
+
+ return 0;
+}
+
+int crash_load_dm_crypt_keys(struct kimage *image)
+{
+ struct kexec_buf kbuf = {
+ .image = image,
+ .buf_min = 0,
+ .buf_max = ULONG_MAX,
+ .top_down = false,
+ .random = true,
+ };
+ int r;
+
+
+ if (key_count <= 0) {
+ kexec_dprintk("No dm-crypt keys\n");
+ return -ENOENT;
+ }
+
+ if (!is_dm_key_reused) {
+ image->dm_crypt_keys_addr = 0;
+ r = build_keys_header();
+ if (r)
+ return r;
+ }
+
+ kbuf.buffer = keys_header;
+ kbuf.bufsz = get_keys_header_size(key_count);
+
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+ r = kexec_add_buffer(&kbuf);
+ if (r) {
+ kvfree((void *)kbuf.buffer);
+ return r;
+ }
+ image->dm_crypt_keys_addr = kbuf.mem;
+ image->dm_crypt_keys_sz = kbuf.bufsz;
+ kexec_dprintk(
+ "Loaded dm crypt keys to kexec_buffer bufsz=0x%lx memsz=0x%lx\n",
+ kbuf.bufsz, kbuf.memsz);
+
+ return r;
+}
+
+static int __init configfs_dmcrypt_keys_init(void)
+{
+ int ret;
+
+ if (is_kdump_kernel()) {
+ config_keys_subsys.su_group.cg_item.ci_type =
+ &kdump_config_keys_type;
+ }
+
+ config_group_init(&config_keys_subsys.su_group);
+ mutex_init(&config_keys_subsys.su_mutex);
+ ret = configfs_register_subsystem(&config_keys_subsys);
+ if (ret) {
+ pr_err("Error %d while registering subsystem %s\n", ret,
+ config_keys_subsys.su_group.cg_item.ci_namebuf);
+ goto out_unregister;
+ }
+
+ return 0;
+
+out_unregister:
+ configfs_unregister_subsystem(&config_keys_subsys);
+
+ return ret;
+}
+
+module_init(configfs_dmcrypt_keys_init);
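A standalone userspace sketch (editor's illustration, not part of this patch) of the blob layout that crash_load_dm_crypt_keys() hands to kexec_add_buffer(): a small header followed by total_keys fixed-size key slots. The constants mirror the definitions above; the kernel's struct_size() additionally guards the multiplication against overflow.

#include <stdio.h>
#include <stddef.h>

#define KEY_SIZE_MAX     256
#define KEY_DESC_MAX_LEN 128

struct dm_crypt_key {
	unsigned int key_size;
	char key_desc[KEY_DESC_MAX_LEN];
	unsigned char data[KEY_SIZE_MAX];
};

struct keys_header {
	unsigned int total_keys;
	struct dm_crypt_key keys[];	/* __counted_by(total_keys) in the kernel */
};

static size_t get_keys_header_size(size_t total_keys)
{
	return sizeof(struct keys_header) + total_keys * sizeof(struct dm_crypt_key);
}

int main(void)
{
	printf("one key:  %zu bytes\n", get_keys_header_size(1));
	printf("128 keys: %zu bytes\n", get_keys_header_size(128));
	return 0;
}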
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index aff7c0fdbefa..acb6bf42e30d 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -131,7 +131,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
cur++;
*crash_base = memparse(cur, &tmp);
if (cur == tmp) {
- pr_warn("crahskernel: Memory value expected after '@'\n");
+ pr_warn("crashkernel: Memory value expected after '@'\n");
return -EINVAL;
}
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index eb63a021ac04..30e7912ebb0d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -14,6 +14,15 @@
#include <linux/delayacct.h>
#include <linux/module.h>
+#define UPDATE_DELAY(type) \
+do { \
+ d->type##_delay_max = tsk->delays->type##_delay_max; \
+ d->type##_delay_min = tsk->delays->type##_delay_min; \
+ tmp = d->type##_delay_total + tsk->delays->type##_delay; \
+ d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
+ d->type##_count += tsk->delays->type##_count; \
+} while (0)
+
DEFINE_STATIC_KEY_FALSE(delayacct_key);
int delayacct_on __read_mostly; /* Delay accounting turned on/off */
struct kmem_cache *delayacct_cache;
@@ -173,41 +182,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
- d->blkio_delay_max = tsk->delays->blkio_delay_max;
- d->blkio_delay_min = tsk->delays->blkio_delay_min;
- tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
- d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
- d->swapin_delay_max = tsk->delays->swapin_delay_max;
- d->swapin_delay_min = tsk->delays->swapin_delay_min;
- tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
- d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
- d->freepages_delay_max = tsk->delays->freepages_delay_max;
- d->freepages_delay_min = tsk->delays->freepages_delay_min;
- tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
- d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
- d->thrashing_delay_max = tsk->delays->thrashing_delay_max;
- d->thrashing_delay_min = tsk->delays->thrashing_delay_min;
- tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
- d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
- d->compact_delay_max = tsk->delays->compact_delay_max;
- d->compact_delay_min = tsk->delays->compact_delay_min;
- tmp = d->compact_delay_total + tsk->delays->compact_delay;
- d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
- d->wpcopy_delay_max = tsk->delays->wpcopy_delay_max;
- d->wpcopy_delay_min = tsk->delays->wpcopy_delay_min;
- tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay;
- d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp;
- d->irq_delay_max = tsk->delays->irq_delay_max;
- d->irq_delay_min = tsk->delays->irq_delay_min;
- tmp = d->irq_delay_total + tsk->delays->irq_delay;
- d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp;
- d->blkio_count += tsk->delays->blkio_count;
- d->swapin_count += tsk->delays->swapin_count;
- d->freepages_count += tsk->delays->freepages_count;
- d->thrashing_count += tsk->delays->thrashing_count;
- d->compact_count += tsk->delays->compact_count;
- d->wpcopy_count += tsk->delays->wpcopy_count;
- d->irq_count += tsk->delays->irq_count;
+ UPDATE_DELAY(blkio);
+ UPDATE_DELAY(swapin);
+ UPDATE_DELAY(freepages);
+ UPDATE_DELAY(thrashing);
+ UPDATE_DELAY(compact);
+ UPDATE_DELAY(wpcopy);
+ UPDATE_DELAY(irq);
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
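A standalone userspace sketch (editor's illustration, not part of this patch) of the token-pasting technique UPDATE_DELAY() relies on, reduced to a single field pair; the two structs are hypothetical stand-ins for taskstats and task_delay_info.

#include <stdio.h>

/* hypothetical stand-ins for taskstats and task_delay_info */
struct totals { unsigned long long blkio_delay_total, blkio_count; };
struct sample { unsigned long long blkio_delay, blkio_count; };

#define UPDATE_DELAY(d, s, type)						\
do {										\
	unsigned long long tmp = (d)->type##_delay_total + (s)->type##_delay;	\
	/* zero the total on overflow, as the kernel macro does */		\
	(d)->type##_delay_total = (tmp < (d)->type##_delay_total) ? 0 : tmp;	\
	(d)->type##_count += (s)->type##_count;					\
} while (0)

int main(void)
{
	struct totals d = { 100, 2 };
	struct sample s = { 40, 1 };

	UPDATE_DELAY(&d, &s, blkio);
	printf("total=%llu count=%llu\n", d.blkio_delay_total, d.blkio_count);
	return 0;
}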
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 8df0dfaaca18..67af8a55185d 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -222,7 +222,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
if (size_cmdline != -1) {
selected_size = size_cmdline;
selected_base = base_cmdline;
- selected_limit = min_not_zero(limit_cmdline, limit);
+
+ /* Honor the user-specified DMA address limit */
+ selected_limit = limit_cmdline ?: limit;
+
if (base_cmdline + size_cmdline == limit_cmdline)
fixed = true;
} else {
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index b8fe0b3d0ffb..24c359d9c879 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -13,6 +13,7 @@
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/slab.h>
+#include <linux/pci-p2pdma.h>
#include "direct.h"
/*
@@ -462,34 +463,33 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
struct pci_p2pdma_map_state p2pdma_state = {};
- enum pci_p2pdma_map_type map;
struct scatterlist *sg;
int i, ret;
for_each_sg(sgl, sg, nents, i) {
- if (is_pci_p2pdma_page(sg_page(sg))) {
- map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg);
- switch (map) {
- case PCI_P2PDMA_MAP_BUS_ADDR:
- continue;
- case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
- /*
- * Any P2P mapping that traverses the PCI
- * host bridge must be mapped with CPU physical
- * address and not PCI bus addresses. This is
- * done with dma_direct_map_page() below.
- */
- break;
- default:
- ret = -EREMOTEIO;
+ switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ /*
+ * Any P2P mapping that traverses the PCI host bridge
+ * must be mapped with CPU physical address and not PCI
+ * bus addresses.
+ */
+ break;
+ case PCI_P2PDMA_MAP_NONE:
+ sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
+ sg->offset, sg->length, dir, attrs);
+ if (sg->dma_address == DMA_MAPPING_ERROR) {
+ ret = -EIO;
goto out_unmap;
}
- }
-
- sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
- sg->offset, sg->length, dir, attrs);
- if (sg->dma_address == DMA_MAPPING_ERROR) {
- ret = -EIO;
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state,
+ sg_phys(sg));
+ sg_dma_mark_bus_address(sg);
+ continue;
+ default:
+ ret = -EREMOTEIO;
goto out_unmap;
}
sg_dma_len(sg) = sg->length;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 051a32988040..107e4a4d251d 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -443,6 +443,24 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
}
EXPORT_SYMBOL_GPL(__dma_need_sync);
+/**
+ * dma_need_unmap - does this device need dma_unmap_* operations
+ * @dev: device to check
+ *
+ * If this function returns %false, drivers can skip calling dma_unmap_* after
+ * finishing an I/O. This function must be called after all mappings that might
+ * need to be unmapped have been performed.
+ */
+bool dma_need_unmap(struct device *dev)
+{
+ if (!dma_map_direct(dev, get_dma_ops(dev)))
+ return true;
+ if (!dev->dma_skip_sync)
+ return true;
+ return IS_ENABLED(CONFIG_DMA_API_DEBUG);
+}
+EXPORT_SYMBOL_GPL(dma_need_unmap);
+
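A hedged usage sketch for dma_need_unmap() above (editor's illustration, not taken from this patch): a driver that only remembers the DMA address for a later unmap when unmapping is actually required. The my_request structure and helpers are hypothetical, and the hardware descriptor that consumes the address is not shown.

#include <linux/dma-mapping.h>

struct my_request {
	void *buf;
	size_t len;
	dma_addr_t dma;		/* kept only when an unmap will be needed */
	bool unmap_needed;
};

static int my_map(struct device *dev, struct my_request *req)
{
	dma_addr_t addr = dma_map_single(dev, req->buf, req->len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, addr))
		return -EIO;

	/* the address itself would be written into a hardware descriptor here */

	req->unmap_needed = dma_need_unmap(dev);
	if (req->unmap_needed)
		req->dma = addr;
	return 0;
}

static void my_complete(struct device *dev, struct my_request *req)
{
	if (req->unmap_needed)
		dma_unmap_single(dev, req->dma, req->len, DMA_TO_DEVICE);
}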
static void dma_setup_need_sync(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
index d4b8bd0af79b..77fcd83dd663 100644
--- a/kernel/entry/Makefile
+++ b/kernel/entry/Makefile
@@ -12,5 +12,6 @@ ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
CFLAGS_common.o += -fno-stack-protector
-obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o
+obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o
+obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o
obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 20154572ede9..408d28b5179d 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -1,84 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/context_tracking.h>
-#include <linux/entry-common.h>
+#include <linux/irq-entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
-#include <linux/audit.h>
#include <linux/tick.h>
-#include "common.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
-static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
-{
- if (unlikely(audit_context())) {
- unsigned long args[6];
-
- syscall_get_arguments(current, regs, args);
- audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
- }
-}
-
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work)
-{
- long ret = 0;
-
- /*
- * Handle Syscall User Dispatch. This must comes first, since
- * the ABI here can be something that doesn't make sense for
- * other syscall_work features.
- */
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (syscall_user_dispatch(regs))
- return -1L;
- }
-
- /* Handle ptrace */
- if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = ptrace_report_syscall_entry(regs);
- if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
- return -1L;
- }
-
- /* Do seccomp after ptrace, to catch any tracer changes. */
- if (work & SYSCALL_WORK_SECCOMP) {
- ret = __secure_computing();
- if (ret == -1L)
- return ret;
- }
-
- /* Either of the above might have changed the syscall number */
- syscall = syscall_get_nr(current, regs);
-
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
- trace_sys_enter(regs, syscall);
- /*
- * Probes or BPF hooks in the tracepoint may have changed the
- * system call number as well.
- */
- syscall = syscall_get_nr(current, regs);
- }
-
- syscall_enter_audit(regs, syscall);
-
- return ret ? : syscall;
-}
-
-noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
-{
- enter_from_user_mode(regs);
- instrumentation_begin();
- local_irq_enable();
- instrumentation_end();
-}
-
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
@@ -133,93 +62,6 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
return ti_work;
}
-/*
- * If SYSCALL_EMU is set, then the only reason to report is when
- * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
- * instruction has been already reported in syscall_enter_from_user_mode().
- */
-static inline bool report_single_step(unsigned long work)
-{
- if (work & SYSCALL_WORK_SYSCALL_EMU)
- return false;
-
- return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
-}
-
-static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
-{
- bool step;
-
- /*
- * If the syscall was rolled back due to syscall user dispatching,
- * then the tracers below are not invoked for the same reason as
- * the entry side was not invoked in syscall_trace_enter(): The ABI
- * of these syscalls is unknown.
- */
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (unlikely(current->syscall_dispatch.on_dispatch)) {
- current->syscall_dispatch.on_dispatch = false;
- return;
- }
- }
-
- audit_syscall_exit(regs);
-
- if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
- trace_sys_exit(regs, syscall_get_return_value(current, regs));
-
- step = report_single_step(work);
- if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- ptrace_report_syscall_exit(regs, step);
-}
-
-/*
- * Syscall specific exit to user mode preparation. Runs with interrupts
- * enabled.
- */
-static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
-{
- unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
- unsigned long nr = syscall_get_nr(current, regs);
-
- CT_WARN_ON(ct_state() != CT_STATE_KERNEL);
-
- if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
- if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
- local_irq_enable();
- }
-
- rseq_syscall(regs);
-
- /*
- * Do one-time syscall specific work. If these work items are
- * enabled, we want to run them exactly once per syscall exit with
- * interrupts enabled.
- */
- if (unlikely(work & SYSCALL_WORK_EXIT))
- syscall_exit_work(regs, work);
-}
-
-static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
-{
- syscall_exit_to_user_mode_prepare(regs);
- local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
-}
-
-void syscall_exit_to_user_mode_work(struct pt_regs *regs)
-{
- __syscall_exit_to_user_mode_work(regs);
-}
-
-__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
-{
- instrumentation_begin();
- __syscall_exit_to_user_mode_work(regs);
- instrumentation_end();
- exit_to_user_mode();
-}
-
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
enter_from_user_mode(regs);
@@ -267,7 +109,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* TINY_RCU does not support EQS, so let the compiler eliminate
* this part when enabled.
*/
- if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+ if (!IS_ENABLED(CONFIG_TINY_RCU) &&
+ (is_idle_task(current) || arch_in_rcu_eqs())) {
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
new file mode 100644
index 000000000000..66e6ba7fa80c
--- /dev/null
+++ b/kernel/entry/syscall-common.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/audit.h>
+#include <linux/entry-common.h>
+#include "common.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+ if (unlikely(audit_context())) {
+ unsigned long args[6];
+
+ syscall_get_arguments(current, regs, args);
+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+ }
+}
+
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
+ unsigned long work)
+{
+ long ret = 0;
+
+ /*
+ * Handle Syscall User Dispatch. This must come first, since
+ * the ABI here can be something that doesn't make sense for
+ * other syscall_work features.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (syscall_user_dispatch(regs))
+ return -1L;
+ }
+
+ /* Handle ptrace */
+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+ ret = ptrace_report_syscall_entry(regs);
+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+ return -1L;
+ }
+
+ /* Do seccomp after ptrace, to catch any tracer changes. */
+ if (work & SYSCALL_WORK_SECCOMP) {
+ ret = __secure_computing();
+ if (ret == -1L)
+ return ret;
+ }
+
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
+ trace_sys_enter(regs, syscall);
+ /*
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number as well.
+ */
+ syscall = syscall_get_nr(current, regs);
+ }
+
+ syscall_enter_audit(regs, syscall);
+
+ return ret ? : syscall;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+ instrumentation_begin();
+ local_irq_enable();
+ instrumentation_end();
+}
+
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has been already reported in syscall_enter_from_user_mode().
+ */
+static inline bool report_single_step(unsigned long work)
+{
+ if (work & SYSCALL_WORK_SYSCALL_EMU)
+ return false;
+
+ return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+ bool step;
+
+ /*
+ * If the syscall was rolled back due to syscall user dispatching,
+ * then the tracers below are not invoked for the same reason as
+ * the entry side was not invoked in syscall_trace_enter(): The ABI
+ * of these syscalls is unknown.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (unlikely(current->syscall_dispatch.on_dispatch)) {
+ current->syscall_dispatch.on_dispatch = false;
+ return;
+ }
+ }
+
+ audit_syscall_exit(regs);
+
+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+ step = report_single_step(work);
+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+ ptrace_report_syscall_exit(regs, step);
+}
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 5340c5aa89e7..a9055eccb27e 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -78,7 +78,7 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon
if (offset || len || selector)
return -EINVAL;
break;
- case PR_SYS_DISPATCH_ON:
+ case PR_SYS_DISPATCH_EXCLUSIVE_ON:
/*
* Validate the direct dispatcher region just for basic
* sanity against overflow and a 0-sized dispatcher
@@ -87,30 +87,40 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon
*/
if (offset && offset + len <= offset)
return -EINVAL;
-
+ break;
+ case PR_SYS_DISPATCH_INCLUSIVE_ON:
+ if (len == 0 || offset + len <= offset)
+ return -EINVAL;
/*
- * access_ok() will clear memory tags for tagged addresses
- * if current has memory tagging enabled.
-
- * To enable a tracer to set a tracees selector the
- * selector address must be untagged for access_ok(),
- * otherwise an untagged tracer will always fail to set a
- * tagged tracees selector.
+ * Invert the range, the check in syscall_user_dispatch()
+ * supports wrap-around.
*/
- if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
- return -EFAULT;
-
+ offset = offset + len;
+ len = -len;
break;
default:
return -EINVAL;
}
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+ *
+ * To enable a tracer to set a tracee's selector, the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracee's selector.
+ */
+ if (mode != PR_SYS_DISPATCH_OFF && selector &&
+ !access_ok(untagged_addr(selector), sizeof(*selector)))
+ return -EFAULT;
+
task->syscall_dispatch.selector = selector;
task->syscall_dispatch.offset = offset;
task->syscall_dispatch.len = len;
task->syscall_dispatch.on_dispatch = false;
- if (mode == PR_SYS_DISPATCH_ON)
+ if (mode != PR_SYS_DISPATCH_OFF)
set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
else
clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
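A standalone userspace sketch (editor's illustration, not part of this patch) of the range inversion performed for PR_SYS_DISPATCH_INCLUSIVE_ON above. The test in in_exempt_range() is an assumption about how syscall_user_dispatch() checks the region (the usual unsigned wrap-around compare), and the addresses are arbitrary.

#include <stdio.h>
#include <stdint.h>

/* assumed form of the exempt-range test (wrap-around friendly) */
static int in_exempt_range(uint64_t ip, uint64_t offset, uint64_t len)
{
	return ip - offset < len;
}

int main(void)
{
	uint64_t off = 0x1000, len = 0x100;	/* user asks to dispatch [0x1000, 0x1100) */
	uint64_t inv_off = off + len;		/* 0x1100 */
	uint64_t inv_len = -len;		/* 2^64 - 0x100 */

	/* an ip inside the requested region is now outside the exempt range */
	printf("0x1080 exempt: %d\n", in_exempt_range(0x1080, inv_off, inv_len));
	/* everything else remains exempt from dispatch */
	printf("0x2000 exempt: %d\n", in_exempt_range(0x2000, inv_off, inv_len));
	return 0;
}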
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f34c99f8ce8f..22fdf0c187cd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
__perf_ctx_unlock(&cpuctx->ctx);
}
+typedef struct {
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+} class_perf_ctx_lock_t;
+
+static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
+{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
+
+static inline class_perf_ctx_lock_t
+class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
+
#define TASK_TOMBSTONE ((void *)-1L)
static bool is_kernel_event(struct perf_event *event)
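
The constructor/destructor pair above lets perf_cgroup_switch() use guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx) and drop the lock automatically on every return path, including the new early-return recheck. The kernel's guard() machinery in linux/cleanup.h is built on the compiler's cleanup attribute; below is a rough, self-contained user-space sketch of the same scope-guard idea with a pthread mutex. The names are illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* Runs whenever the guard variable goes out of scope. */
static void unlock_guard(pthread_mutex_t **lockp)
{
	pthread_mutex_unlock(*lockp);
	printf("lock released on scope exit\n");
}

#define scoped_lock(lock) \
	pthread_mutex_t *__scope_guard __attribute__((cleanup(unlock_guard))) = \
		(pthread_mutex_lock(lock), (lock))

static int do_work(int bail_early)
{
	scoped_lock(&demo_lock);

	if (bail_early)
		return -1;	/* unlock_guard() still runs */

	printf("did work under the lock\n");
	return 0;		/* ...and here too */
}

int main(void)
{
	do_work(1);
	do_work(0);
	return 0;
}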
@@ -938,13 +951,19 @@ static void perf_cgroup_switch(struct task_struct *task)
if (READ_ONCE(cpuctx->cgrp) == NULL)
return;
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
cgrp = perf_cgroup_from_task(task, NULL);
if (READ_ONCE(cpuctx->cgrp) == cgrp)
return;
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
+ /*
+ * Re-check, could've raced vs perf_remove_from_context().
+ */
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
+
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+
perf_ctx_disable(&cpuctx->ctx, true);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
@@ -962,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task)
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -2120,18 +2138,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event)
del_event_from_groups(event, ctx);
- /*
- * If event was in error state, then keep it
- * that way, otherwise bogus counts will be
- * returned on read(). The only way to get out
- * of error state is by explicit re-enabling
- * of the event
- */
- if (event->state > PERF_EVENT_STATE_OFF) {
- perf_cgroup_event_disable(event, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_OFF);
- }
-
ctx->generation++;
event->pmu_ctx->nr_events--;
}
@@ -2149,8 +2155,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
}
static void put_event(struct perf_event *event);
-static void event_sched_out(struct perf_event *event,
- struct perf_event_context *ctx);
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state);
static void perf_put_aux_event(struct perf_event *event)
{
@@ -2183,8 +2190,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
}
}
@@ -2242,18 +2248,6 @@ static inline struct list_head *get_event_list(struct perf_event *event)
&event->pmu_ctx->flexible_active;
}
-/*
- * Events that have PERF_EV_CAP_SIBLING require being part of a group and
- * cannot exist on their own, schedule them out and move them into the ERROR
- * state. Also see _perf_event_enable(), it will not be able to recover
- * this ERROR state.
- */
-static inline void perf_remove_sibling_event(struct perf_event *event)
-{
- event_sched_out(event, event->ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
-}
-
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
@@ -2289,8 +2283,15 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ /*
+ * Events that have PERF_EV_CAP_SIBLING require being part of
+ * a group and cannot exist on their own, schedule them out
+ * and move them into the ERROR state. Also see
+ * _perf_event_enable(), it will not be able to recover this
+ * ERROR state.
+ */
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
- perf_remove_sibling_event(sibling);
+ __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2493,11 +2494,14 @@ __perf_remove_from_context(struct perf_event *event,
state = PERF_EVENT_STATE_EXIT;
if (flags & DETACH_REVOKE)
state = PERF_EVENT_STATE_REVOKED;
- if (flags & DETACH_DEAD) {
- event->pending_disable = 1;
+ if (flags & DETACH_DEAD)
state = PERF_EVENT_STATE_DEAD;
- }
+
event_sched_out(event, ctx);
+
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_disable(event, ctx);
+
perf_event_set_state(event, min(event->state, state));
if (flags & DETACH_GROUP)
@@ -2562,6 +2566,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
event_function_call(event, __perf_remove_from_context, (void *)flags);
}
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state)
+{
+ event_sched_out(event, ctx);
+ perf_cgroup_event_disable(event, ctx);
+ perf_event_set_state(event, state);
+}
+
/*
* Cross CPU call to disable a performance event
*/
@@ -2576,13 +2589,18 @@ static void __perf_event_disable(struct perf_event *event,
perf_pmu_disable(event->pmu_ctx->pmu);
ctx_time_update_event(ctx, event);
+ /*
+ * When disabling a group leader, the whole group becomes ineligible
+ * to run, so schedule out the full group.
+ */
if (event == event->group_leader)
group_sched_out(event, ctx);
- else
- event_sched_out(event, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_OFF);
- perf_cgroup_event_disable(event, ctx);
+ /*
+ * But only mark the leader OFF; the siblings will remain
+ * INACTIVE.
+ */
+ __event_disable(event, ctx, PERF_EVENT_STATE_OFF);
perf_pmu_enable(event->pmu_ctx->pmu);
}
@@ -2656,8 +2674,8 @@ static void perf_event_unthrottle(struct perf_event *event, bool start)
static void perf_event_throttle(struct perf_event *event)
{
- event->pmu->stop(event, 0);
event->hw.interrupts = MAX_INTERRUPTS;
+ event->pmu->stop(event, 0);
if (event == event->group_leader)
perf_log_throttle(event, 0);
}
@@ -7186,18 +7204,18 @@ void perf_event_wakeup(struct perf_event *event)
static void perf_sigtrap(struct perf_event *event)
{
/*
- * We'd expect this to only occur if the irq_work is delayed and either
- * ctx->task or current has changed in the meantime. This can be the
- * case on architectures that do not implement arch_irq_work_raise().
+ * Both perf_pending_task() and perf_pending_irq() can race with the
+ * task exiting.
*/
- if (WARN_ON_ONCE(event->ctx->task != current))
+ if (current->flags & PF_EXITING)
return;
/*
- * Both perf_pending_task() and perf_pending_irq() can race with the
- * task exiting.
+ * We'd expect this to only occur if the irq_work is delayed and either
+ * ctx->task or current has changed in the meantime. This can be the
+ * case on architectures that do not implement arch_irq_work_raise().
*/
- if (current->flags & PF_EXITING)
+ if (WARN_ON_ONCE(event->ctx->task != current))
return;
send_sig_perf((void __user *)event->pending_addr,
@@ -7233,15 +7251,15 @@ static void __perf_pending_disable(struct perf_event *event)
* CPU-A CPU-B
*
* perf_event_disable_inatomic()
- * @pending_disable = CPU-A;
+ * @pending_disable = 1;
* irq_work_queue();
*
* sched-out
- * @pending_disable = -1;
+ * @pending_disable = 0;
*
* sched-in
* perf_event_disable_inatomic()
- * @pending_disable = CPU-B;
+ * @pending_disable = 1;
* irq_work_queue(); // FAILS
*
* irq_work_run()
@@ -7439,6 +7457,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
if (!regs)
return 0;
+ /* No mm, no stack, no dump. */
+ if (!current->mm)
+ return 0;
+
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
@@ -8150,6 +8172,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ if (!current->mm)
+ user = false;
+
if (!kernel && !user)
return &__empty_callchain;
@@ -11091,7 +11116,7 @@ static int perf_uprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_uprobe.type)
return -ENOENT;
- if (!perfmon_capable())
+ if (!capable(CAP_SYS_ADMIN))
return -EACCES;
/*
@@ -11749,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (is_sampling_event(event)) {
+ /*
+ * The throttle can be triggered in the hrtimer handler.
+ * The HRTIMER_NORESTART should be used to stop the timer,
+ * rather than hrtimer_cancel(). See perf_swevent_hrtimer()
+ */
+ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
@@ -11804,7 +11834,8 @@ static void cpu_clock_event_start(struct perf_event *event, int flags)
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
- cpu_clock_event_update(event);
+ if (flags & PERF_EF_UPDATE)
+ cpu_clock_event_update(event);
}
static int cpu_clock_event_add(struct perf_event *event, int flags)
@@ -11882,7 +11913,8 @@ static void task_clock_event_start(struct perf_event *event, int flags)
static void task_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
- task_clock_event_update(event, event->ctx->time);
+ if (flags & PERF_EF_UPDATE)
+ task_clock_event_update(event, event->ctx->time);
}
static int task_clock_event_add(struct perf_event *event, int flags)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index d2aef87c7e9f..aa9a759e824f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -441,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
- event->pending_disable = smp_processor_id();
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
WRITE_ONCE(rb->aux_nest, 0);
goto err_put;
@@ -526,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
- handle->event->pending_disable = smp_processor_id();
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8d783b5882b6..4c965ba77f9f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -29,6 +29,7 @@
#include <linux/workqueue.h>
#include <linux/srcu.h>
#include <linux/oom.h> /* check_stable_address_space */
+#include <linux/pagewalk.h>
#include <linux/uprobes.h>
@@ -152,91 +153,6 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
}
/**
- * __replace_page - replace page in vma by new page.
- * based on replace_page in mm/ksm.c
- *
- * @vma: vma that holds the pte pointing to page
- * @addr: address the old @page is mapped at
- * @old_page: the page we are replacing by new_page
- * @new_page: the modified page we replace page by
- *
- * If @new_page is NULL, only unmap @old_page.
- *
- * Returns 0 on success, negative error code otherwise.
- */
-static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *old_page, struct page *new_page)
-{
- struct folio *old_folio = page_folio(old_page);
- struct folio *new_folio;
- struct mm_struct *mm = vma->vm_mm;
- DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
- int err;
- struct mmu_notifier_range range;
- pte_t pte;
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
- addr + PAGE_SIZE);
-
- if (new_page) {
- new_folio = page_folio(new_page);
- err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
- if (err)
- return err;
- }
-
- /* For folio_free_swap() below */
- folio_lock(old_folio);
-
- mmu_notifier_invalidate_range_start(&range);
- err = -EAGAIN;
- if (!page_vma_mapped_walk(&pvmw))
- goto unlock;
- VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
- pte = ptep_get(pvmw.pte);
-
- /*
- * Handle PFN swap PTES, such as device-exclusive ones, that actually
- * map pages: simply trigger GUP again to fix it up.
- */
- if (unlikely(!pte_present(pte))) {
- page_vma_mapped_walk_done(&pvmw);
- goto unlock;
- }
-
- if (new_page) {
- folio_get(new_folio);
- folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);
- folio_add_lru_vma(new_folio, vma);
- } else
- /* no new page, just dec_mm_counter for old_page */
- dec_mm_counter(mm, MM_ANONPAGES);
-
- if (!folio_test_anon(old_folio)) {
- dec_mm_counter(mm, mm_counter_file(old_folio));
- inc_mm_counter(mm, MM_ANONPAGES);
- }
-
- flush_cache_page(vma, addr, pte_pfn(pte));
- ptep_clear_flush(vma, addr, pvmw.pte);
- if (new_page)
- set_pte_at(mm, addr, pvmw.pte,
- mk_pte(new_page, vma->vm_page_prot));
-
- folio_remove_rmap_pte(old_folio, old_page, vma);
- if (!folio_mapped(old_folio))
- folio_free_swap(old_folio);
- page_vma_mapped_walk_done(&pvmw);
- folio_put(old_folio);
-
- err = 0;
- unlock:
- mmu_notifier_invalidate_range_end(&range);
- folio_unlock(old_folio);
- return err;
-}
-
-/**
* is_swbp_insn - check if instruction is breakpoint instruction.
* @insn: instruction to be checked.
* Default implementation of is_swbp_insn
@@ -463,6 +379,95 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
+static bool orig_page_is_identical(struct vm_area_struct *vma,
+ unsigned long vaddr, struct page *page, bool *pmd_mappable)
+{
+ const pgoff_t index = vaddr_to_offset(vma, vaddr) >> PAGE_SHIFT;
+ struct folio *orig_folio = filemap_get_folio(vma->vm_file->f_mapping,
+ index);
+ struct page *orig_page;
+ bool identical;
+
+ if (IS_ERR(orig_folio))
+ return false;
+ orig_page = folio_file_page(orig_folio, index);
+
+ *pmd_mappable = folio_test_pmd_mappable(orig_folio);
+ identical = folio_test_uptodate(orig_folio) &&
+ pages_identical(page, orig_page);
+ folio_put(orig_folio);
+ return identical;
+}
+
+static int __uprobe_write_opcode(struct vm_area_struct *vma,
+ struct folio_walk *fw, struct folio *folio,
+ unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+{
+ const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ const bool is_register = !!is_swbp_insn(&opcode);
+ bool pmd_mappable;
+
+ /* For now, we'll only handle PTE-mapped folios. */
+ if (fw->level != FW_LEVEL_PTE)
+ return -EFAULT;
+
+ /*
+ * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
+ * but the VMA might not be writable.
+ */
+ if (!pte_write(fw->pte)) {
+ if (!PageAnonExclusive(fw->page))
+ return -EFAULT;
+ if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
+ return -EFAULT;
+ /* SOFTDIRTY is handled via pte_mkdirty() below. */
+ }
+
+ /*
+ * We'll temporarily unmap the page and flush the TLB, such that we can
+ * modify the page atomically.
+ */
+ flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
+ fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
+ copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+
+ /*
+ * When unregistering, we may only zap a PTE if uffd is disabled and
+ * there are no unexpected folio references ...
+ */
+ if (is_register || userfaultfd_missing(vma) ||
+ (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
+ folio_test_swapcache(folio) * folio_nr_pages(folio)))
+ goto remap;
+
+ /*
+ * ... and the mapped page is identical to the original page that
+ * would get faulted in on next access.
+ */
+ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
+ goto remap;
+
+ dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
+ folio_remove_rmap_pte(folio, fw->page, vma);
+ if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio);
+
+ return pmd_mappable;
+remap:
+ /*
+ * Make sure that our copy_to_page() changes become visible before the
+ * set_pte_at() write.
+ */
+ smp_wmb();
+ /* We modified the page. Make sure to mark the PTE dirty. */
+ set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
+ return 0;
+}
+
/*
* NOTE:
* Expect the breakpoint instruction to be the smallest size instruction for
@@ -474,146 +479,146 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
*
* uprobe_write_opcode - write the opcode at a given virtual address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
- * @vaddr: the virtual address to store the opcode.
- * @opcode: opcode to be written at @vaddr.
+ * @vma: the probed virtual memory area.
+ * @opcode_vaddr: the virtual address to store the opcode.
+ * @opcode: opcode to be written at @opcode_vaddr.
*
* Called with mm->mmap_lock held for read or write.
* Return 0 (success) or a negative errno.
*/
-int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
- unsigned long vaddr, uprobe_opcode_t opcode)
+int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
{
+ const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+ struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
- struct page *old_page, *new_page;
- struct vm_area_struct *vma;
int ret, is_register, ref_ctr_updated = 0;
- bool orig_page_huge = false;
unsigned int gup_flags = FOLL_FORCE;
+ struct mmu_notifier_range range;
+ struct folio_walk fw;
+ struct folio *folio;
+ struct page *page;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
-retry:
+ if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
+ return -EINVAL;
+
+ /*
+ * When registering, we have to break COW to get an exclusive anonymous
+ * page that we can safely modify. Use FOLL_WRITE to trigger a write
+ * fault if required. When unregistering, we might be lucky and the
+ * anon page is already gone. So defer write faults until really
+ * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
+ * cannot deal with PMDs yet.
+ */
if (is_register)
- gup_flags |= FOLL_SPLIT_PMD;
- /* Read the page with vaddr into memory */
- old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
- if (IS_ERR(old_page))
- return PTR_ERR(old_page);
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
- ret = verify_opcode(old_page, vaddr, &opcode);
+retry:
+ ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
if (ret <= 0)
- goto put_old;
-
- if (is_zero_page(old_page)) {
- ret = -EINVAL;
- goto put_old;
- }
+ goto out;
+ folio = page_folio(page);
- if (WARN(!is_register && PageCompound(old_page),
- "uprobe unregister should never work on compound page\n")) {
- ret = -EINVAL;
- goto put_old;
+ ret = verify_opcode(page, opcode_vaddr, &opcode);
+ if (ret <= 0) {
+ folio_put(folio);
+ goto out;
}
/* We are going to replace instruction, update ref_ctr. */
if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
- if (ret)
- goto put_old;
+ if (ret) {
+ folio_put(folio);
+ goto out;
+ }
ref_ctr_updated = 1;
}
ret = 0;
- if (!is_register && !PageAnon(old_page))
- goto put_old;
-
- ret = anon_vma_prepare(vma);
- if (ret)
- goto put_old;
-
- ret = -ENOMEM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
- if (!new_page)
- goto put_old;
-
- __SetPageUptodate(new_page);
- copy_highpage(new_page, old_page);
- copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+ if (unlikely(!folio_test_anon(folio))) {
+ VM_WARN_ON_ONCE(is_register);
+ folio_put(folio);
+ goto out;
+ }
if (!is_register) {
- struct page *orig_page;
- pgoff_t index;
-
- VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
-
- index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
- orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
- index);
-
- if (orig_page) {
- if (PageUptodate(orig_page) &&
- pages_identical(new_page, orig_page)) {
- /* let go new_page */
- put_page(new_page);
- new_page = NULL;
-
- if (PageCompound(orig_page))
- orig_page_huge = true;
- }
- put_page(orig_page);
- }
+ /*
+ * In the common case, we'll be able to zap the page when
+ * unregistering. So trigger MMU notifiers now, as we won't
+ * be able to do it under PTL.
+ */
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+ vaddr, vaddr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
}
- ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
- if (new_page)
- put_page(new_page);
-put_old:
- put_page(old_page);
+ ret = -EAGAIN;
+ /* Walk the page tables again, to perform the actual update. */
+ if (folio_walk_start(&fw, vma, vaddr, 0)) {
+ if (fw.page == page)
+ ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
+ folio_walk_end(&fw, vma);
+ }
+
+ if (!is_register)
+ mmu_notifier_invalidate_range_end(&range);
- if (unlikely(ret == -EAGAIN))
+ folio_put(folio);
+ switch (ret) {
+ case -EFAULT:
+ gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
+ fallthrough;
+ case -EAGAIN:
goto retry;
+ default:
+ break;
+ }
+out:
/* Revert back reference counter if instruction update failed. */
- if (ret && is_register && ref_ctr_updated)
+ if (ret < 0 && is_register && ref_ctr_updated)
update_ref_ctr(uprobe, mm, -1);
/* try collapse pmd for compound page */
- if (!ret && orig_page_huge)
+ if (ret > 0)
collapse_pte_mapped_thp(mm, vaddr, false);
- return ret;
+ return ret < 0 ? ret : 0;
}
/**
* set_swbp - store breakpoint at a given address.
* @auprobe: arch specific probepoint information.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, store the breakpoint instruction at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
+ return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
}
/**
* set_orig_insn - Restore the original instruction.
- * @mm: the probed process address space.
+ * @vma: the probed virtual memory area.
* @auprobe: arch specific probepoint information.
* @vaddr: the virtual address to insert the opcode.
*
* For mm @mm, restore the original opcode (opcode) at @vaddr.
* Return 0 (success) or a negative errno.
*/
-int __weak
-set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
+int __weak set_orig_insn(struct arch_uprobe *auprobe,
+ struct vm_area_struct *vma, unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr,
+ return uprobe_write_opcode(auprobe, vma, vaddr,
*(uprobe_opcode_t *)&auprobe->insn);
}
@@ -1134,10 +1139,10 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
return ret;
}
-static int
-install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long vaddr)
+static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
bool first_uprobe;
int ret;
@@ -1153,7 +1158,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
if (first_uprobe)
set_bit(MMF_HAS_UPROBES, &mm->flags);
- ret = set_swbp(&uprobe->arch, mm, vaddr);
+ ret = set_swbp(&uprobe->arch, vma, vaddr);
if (!ret)
clear_bit(MMF_RECALC_UPROBES, &mm->flags);
else if (first_uprobe)
@@ -1162,11 +1167,13 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
return ret;
}
-static int
-remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
+static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma,
+ unsigned long vaddr)
{
+ struct mm_struct *mm = vma->vm_mm;
+
set_bit(MMF_RECALC_UPROBES, &mm->flags);
- return set_orig_insn(&uprobe->arch, mm, vaddr);
+ return set_orig_insn(&uprobe->arch, vma, vaddr);
}
struct map_info {
@@ -1296,10 +1303,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
if (consumer_filter(new, mm))
- err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+ err = install_breakpoint(uprobe, vma, info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
if (!filter_chain(uprobe, mm))
- err |= remove_breakpoint(uprobe, mm, info->vaddr);
+ err |= remove_breakpoint(uprobe, vma, info->vaddr);
}
unlock:
@@ -1472,7 +1479,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
continue;
vaddr = offset_to_vaddr(vma, uprobe->offset);
- err |= remove_breakpoint(uprobe, mm, vaddr);
+ err |= remove_breakpoint(uprobe, vma, vaddr);
}
mmap_read_unlock(mm);
@@ -1610,7 +1617,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (!fatal_signal_pending(current) &&
filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
- install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
+ install_breakpoint(uprobe, vma, vaddr);
}
put_uprobe(uprobe);
}
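
The zap-on-unregister fast path in __uprobe_write_opcode() above only drops the anonymous page when every folio reference is accounted for: the PTE mappings, the reference this function itself holds from get_user_pages_remote(), and any swap-cache references. A tiny worked example of that bookkeeping, with made-up numbers:

#include <stdio.h>

/* Illustrative numbers only: a single-page anonymous folio mapped once. */
int main(void)
{
	int folio_nr_pages = 1;
	int folio_mapcount = 1;	/* mapped by this VMA's PTE */
	int gup_reference  = 1;	/* the reference uprobe_write_opcode() holds */
	int swapcache      = 0;	/* folio_test_swapcache() */

	int expected = folio_mapcount + gup_reference +
		       swapcache * folio_nr_pages;
	int folio_ref_count = 2;	/* what the kernel would read */

	printf("expected=%d actual=%d -> %s\n", expected, folio_ref_count,
	       folio_ref_count == expected ?
	       "no unexpected references, safe to zap" :
	       "extra references, keep the page mapped (remap path)");
	return 0;
}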
diff --git a/kernel/exit.c b/kernel/exit.c
index 38645039dd8f..bb184a67ac73 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -421,44 +421,30 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
-static void coredump_task_exit(struct task_struct *tsk)
+static void coredump_task_exit(struct task_struct *tsk,
+ struct core_state *core_state)
{
- struct core_state *core_state;
+ struct core_thread self;
+ self.task = tsk;
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
/*
- * Serialize with any possible pending coredump.
- * We must hold siglock around checking core_state
- * and setting PF_POSTCOREDUMP. The core-inducing thread
- * will increment ->nr_threads for each thread in the
- * group without PF_POSTCOREDUMP set.
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
*/
- spin_lock_irq(&tsk->sighand->siglock);
- tsk->flags |= PF_POSTCOREDUMP;
- core_state = tsk->signal->core_state;
- spin_unlock_irq(&tsk->sighand->siglock);
- if (core_state) {
- struct core_thread self;
-
- self.task = current;
- if (self.task->flags & PF_SIGNALED)
- self.next = xchg(&core_state->dumper.next, &self);
- else
- self.task = NULL;
- /*
- * Implies mb(), the result of xchg() must be visible
- * to core_state->dumper.
- */
- if (atomic_dec_and_test(&core_state->nr_threads))
- complete(&core_state->startup);
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
- for (;;) {
- set_current_state(TASK_IDLE|TASK_FREEZABLE);
- if (!self.task) /* see coredump_finish() */
- break;
- schedule();
- }
- __set_current_state(TASK_RUNNING);
+ for (;;) {
+ set_current_state(TASK_IDLE|TASK_FREEZABLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ schedule();
}
+ __set_current_state(TASK_RUNNING);
}
#ifdef CONFIG_MEMCG
@@ -882,6 +868,7 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
{
struct sighand_struct *sighand = tsk->sighand;
struct signal_struct *signal = tsk->signal;
+ struct core_state *core_state;
spin_lock_irq(&sighand->siglock);
signal->quick_threads--;
@@ -891,7 +878,19 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
signal->group_exit_code = code;
signal->group_stop_count = 0;
}
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold siglock around checking core_state
+ * and setting PF_POSTCOREDUMP. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group without PF_POSTCOREDUMP set.
+ */
+ tsk->flags |= PF_POSTCOREDUMP;
+ core_state = signal->core_state;
spin_unlock_irq(&sighand->siglock);
+
+ if (unlikely(core_state))
+ coredump_task_exit(tsk, core_state);
}
void __noreturn do_exit(long code)
@@ -900,15 +899,12 @@ void __noreturn do_exit(long code)
int group_dead;
WARN_ON(irqs_disabled());
-
- synchronize_group_exit(tsk, code);
-
WARN_ON(tsk->plug);
kcov_task_exit(tsk);
kmsan_task_exit(tsk);
- coredump_task_exit(tsk);
+ synchronize_group_exit(tsk, code);
ptrace_event(PTRACE_EVENT_EXIT, code);
user_events_exit(tsk);
@@ -944,6 +940,15 @@ void __noreturn do_exit(long code)
taskstats_exit(tsk, group_dead);
trace_sched_process_exit(tsk, group_dead);
+ /*
+ * Since sampling can touch ->mm, make sure to stop everything before we
+ * tear it down.
+ *
+ * Also flushes inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_event_exit_task(tsk);
+
exit_mm();
if (group_dead)
@@ -959,14 +964,6 @@ void __noreturn do_exit(long code)
exit_task_work(tsk);
exit_thread(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- *
- * because of cgroup mode, must be called before cgroup_exit()
- */
- perf_event_exit_task(tsk);
-
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
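
The xchg() in coredump_task_exit() is a lock-free list push: each exiting thread links itself onto core_state->dumper with a single atomic exchange, and the old head simply becomes the new node's next pointer. A self-contained user-space sketch of that pattern using C11 atomics (types and names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

struct core_thread_demo {
	struct core_thread_demo *next;
	int id;
};

static _Atomic(struct core_thread_demo *) dumper_head;

static void push(struct core_thread_demo *self)
{
	/* Equivalent of: self->next = xchg(&head, self); implies a full barrier. */
	self->next = atomic_exchange(&dumper_head, self);
}

int main(void)
{
	struct core_thread_demo a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };

	push(&a);
	push(&b);
	push(&c);

	for (struct core_thread_demo *t = atomic_load(&dumper_head); t; t = t->next)
		printf("thread %d\n", t->id);	/* prints 3, 2, 1 */
	return 0;
}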
diff --git a/kernel/fork.c b/kernel/fork.c
index 85afccfdf3b1..52901fe4a3c2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -93,7 +93,7 @@
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
-#include <linux/stackleak.h>
+#include <linux/kstack_erase.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
@@ -112,6 +112,9 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+/* For dup_mmap(). */
+#include "../mm/internal.h"
+
#include <trace/events/sched.h>
#define CREATE_TRACE_POINTS
@@ -428,88 +431,9 @@ struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;
-/* SLAB cache for vm_area_struct structures */
-static struct kmem_cache *vm_area_cachep;
-
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
-
- vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (!vma)
- return NULL;
-
- vma_init(vma, mm);
-
- return vma;
-}
-
-static void vm_area_init_from(const struct vm_area_struct *src,
- struct vm_area_struct *dest)
-{
- dest->vm_mm = src->vm_mm;
- dest->vm_ops = src->vm_ops;
- dest->vm_start = src->vm_start;
- dest->vm_end = src->vm_end;
- dest->anon_vma = src->anon_vma;
- dest->vm_pgoff = src->vm_pgoff;
- dest->vm_file = src->vm_file;
- dest->vm_private_data = src->vm_private_data;
- vm_flags_init(dest, src->vm_flags);
- memcpy(&dest->vm_page_prot, &src->vm_page_prot,
- sizeof(dest->vm_page_prot));
- /*
- * src->shared.rb may be modified concurrently when called from
- * dup_mmap(), but the clone will reinitialize it.
- */
- data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
- memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
- sizeof(dest->vm_userfaultfd_ctx));
-#ifdef CONFIG_ANON_VMA_NAME
- dest->anon_name = src->anon_name;
-#endif
-#ifdef CONFIG_SWAP
- memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
- sizeof(dest->swap_readahead_info));
-#endif
-#ifndef CONFIG_MMU
- dest->vm_region = src->vm_region;
-#endif
-#ifdef CONFIG_NUMA
- dest->vm_policy = src->vm_policy;
-#endif
-}
-
-struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
-{
- struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-
- if (!new)
- return NULL;
-
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- vm_area_init_from(orig, new);
- vma_lock_init(new, true);
- INIT_LIST_HEAD(&new->anon_vma_chain);
- vma_numab_state_init(new);
- dup_anon_vma_name(orig, new);
-
- return new;
-}
-
-void vm_area_free(struct vm_area_struct *vma)
-{
- /* The vma should be detached while being destroyed. */
- vma_assert_detached(vma);
- vma_numab_state_free(vma);
- free_anon_vma_name(vma);
- kmem_cache_free(vm_area_cachep, vma);
-}
-
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -589,7 +513,7 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
-static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
struct file *exe_file;
@@ -604,183 +528,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
}
#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
- struct mm_struct *oldmm)
-{
- struct vm_area_struct *mpnt, *tmp;
- int retval;
- unsigned long charge = 0;
- LIST_HEAD(uf);
- VMA_ITERATOR(vmi, mm, 0);
-
- if (mmap_write_lock_killable(oldmm))
- return -EINTR;
- flush_cache_dup_mm(oldmm);
- uprobe_dup_mmap(oldmm, mm);
- /*
- * Not linked in yet - no deadlock potential:
- */
- mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
-
- /* No ordering required: file already has been exposed. */
- dup_mm_exe_file(mm, oldmm);
-
- mm->total_vm = oldmm->total_vm;
- mm->data_vm = oldmm->data_vm;
- mm->exec_vm = oldmm->exec_vm;
- mm->stack_vm = oldmm->stack_vm;
-
- /* Use __mt_dup() to efficiently build an identical maple tree. */
- retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
- if (unlikely(retval))
- goto out;
-
- mt_clear_in_rcu(vmi.mas.tree);
- for_each_vma(vmi, mpnt) {
- struct file *file;
-
- vma_start_write(mpnt);
- if (mpnt->vm_flags & VM_DONTCOPY) {
- retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
- mpnt->vm_end, GFP_KERNEL);
- if (retval)
- goto loop_out;
-
- vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
- continue;
- }
- charge = 0;
- /*
- * Don't duplicate many vmas if we've been oom-killed (for
- * example)
- */
- if (fatal_signal_pending(current)) {
- retval = -EINTR;
- goto loop_out;
- }
- if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned long len = vma_pages(mpnt);
-
- if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
- goto fail_nomem;
- charge = len;
- }
- tmp = vm_area_dup(mpnt);
- if (!tmp)
- goto fail_nomem;
-
- /* track_pfn_copy() will later take care of copying internal state. */
- if (unlikely(tmp->vm_flags & VM_PFNMAP))
- untrack_pfn_clear(tmp);
-
- retval = vma_dup_policy(mpnt, tmp);
- if (retval)
- goto fail_nomem_policy;
- tmp->vm_mm = mm;
- retval = dup_userfaultfd(tmp, &uf);
- if (retval)
- goto fail_nomem_anon_vma_fork;
- if (tmp->vm_flags & VM_WIPEONFORK) {
- /*
- * VM_WIPEONFORK gets a clean slate in the child.
- * Don't prepare anon_vma until fault since we don't
- * copy page for current vma.
- */
- tmp->anon_vma = NULL;
- } else if (anon_vma_fork(tmp, mpnt))
- goto fail_nomem_anon_vma_fork;
- vm_flags_clear(tmp, VM_LOCKED_MASK);
- /*
- * Copy/update hugetlb private vma information.
- */
- if (is_vm_hugetlb_page(tmp))
- hugetlb_dup_vma_private(tmp);
-
- /*
- * Link the vma into the MT. After using __mt_dup(), memory
- * allocation is not necessary here, so it cannot fail.
- */
- vma_iter_bulk_store(&vmi, tmp);
-
- mm->map_count++;
-
- if (tmp->vm_ops && tmp->vm_ops->open)
- tmp->vm_ops->open(tmp);
-
- file = tmp->vm_file;
- if (file) {
- struct address_space *mapping = file->f_mapping;
-
- get_file(file);
- i_mmap_lock_write(mapping);
- if (vma_is_shared_maywrite(tmp))
- mapping_allow_writable(mapping);
- flush_dcache_mmap_lock(mapping);
- /* insert tmp into the share list, just after mpnt */
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
- i_mmap_unlock_write(mapping);
- }
-
- if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(tmp, mpnt);
-
- if (retval) {
- mpnt = vma_next(&vmi);
- goto loop_out;
- }
- }
- /* a new mm has just been created */
- retval = arch_dup_mmap(oldmm, mm);
-loop_out:
- vma_iter_free(&vmi);
- if (!retval) {
- mt_set_in_rcu(vmi.mas.tree);
- ksm_fork(mm, oldmm);
- khugepaged_fork(mm, oldmm);
- } else {
-
- /*
- * The entire maple tree has already been duplicated. If the
- * mmap duplication fails, mark the failure point with
- * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
- * stop releasing VMAs that have not been duplicated after this
- * point.
- */
- if (mpnt) {
- mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
- mas_store(&vmi.mas, XA_ZERO_ENTRY);
- /* Avoid OOM iterating a broken tree */
- set_bit(MMF_OOM_SKIP, &mm->flags);
- }
- /*
- * The mm_struct is going to exit, but the locks will be dropped
- * first. Set the mm_struct as unstable is advisable as it is
- * not fully initialised.
- */
- set_bit(MMF_UNSTABLE, &mm->flags);
- }
-out:
- mmap_write_unlock(mm);
- flush_tlb_mm(oldmm);
- mmap_write_unlock(oldmm);
- if (!retval)
- dup_userfaultfd_complete(&uf);
- else
- dup_userfaultfd_fail(&uf);
- return retval;
-
-fail_nomem_anon_vma_fork:
- mpol_put(vma_policy(tmp));
-fail_nomem_policy:
- vm_area_free(tmp);
-fail_nomem:
- retval = -ENOMEM;
- vm_unacct_memory(charge);
- goto loop_out;
-}
-
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
mm->pgd = pgd_alloc(mm);
@@ -794,13 +541,6 @@ static inline void mm_free_pgd(struct mm_struct *mm)
pgd_free(mm, mm->pgd);
}
#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
- mmap_write_lock(oldmm);
- dup_mm_exe_file(mm, oldmm);
- mmap_write_unlock(oldmm);
- return 0;
-}
#define mm_alloc_pgd(mm) (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
@@ -1306,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
- futex_mm_init(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL;
#endif
@@ -1321,6 +1060,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->def_flags = 0;
}
+ if (futex_mm_init(mm))
+ goto fail_mm_init;
+
if (mm_alloc_pgd(mm))
goto fail_nopgd;
@@ -1350,6 +1092,8 @@ fail_nocontext:
fail_noid:
mm_free_pgd(mm);
fail_nopgd:
+ futex_hash_free(mm);
+fail_mm_init:
free_mm(mm);
return NULL;
}
@@ -1405,7 +1149,7 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
static void mmput_async_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct,
@@ -1802,14 +1546,14 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
/* "users" and "in_exec" locked for check_unsafe_exec() */
if (fs->in_exec) {
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return -EAGAIN;
}
fs->users++;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return 0;
}
tsk->fs = copy_fs_struct(fs);
@@ -2146,10 +1890,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
#ifdef CONFIG_RV
static void rv_task_fork(struct task_struct *p)
{
- int i;
-
- for (i = 0; i < RV_PER_TASK_MONITORS; i++)
- p->rv[i].da_mon.monitoring = false;
+ memset(&p->rv, 0, sizeof(p->rv));
}
#else
#define rv_task_fork(p) do {} while (0)
@@ -2383,9 +2124,8 @@ __latent_entropy struct task_struct *copy_process(
lockdep_init_task(p);
#endif
-#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
-#endif
+
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
@@ -3003,7 +2743,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
}
#endif
-noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
{
@@ -3228,11 +2968,6 @@ void __init mm_cache_init(void)
void __init proc_caches_init(void)
{
- struct kmem_cache_args args = {
- .use_freeptr_offset = true,
- .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
- };
-
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3249,10 +2984,6 @@ void __init proc_caches_init(void)
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- vm_area_cachep = kmem_cache_create("vm_area_struct",
- sizeof(struct vm_area_struct), &args,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
- SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
@@ -3418,13 +3149,13 @@ int ksys_unshare(unsigned long unshare_flags)
if (new_fs) {
fs = current->fs;
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
current->fs = new_fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
}
if (new_fd)
@@ -3485,7 +3216,7 @@ int unshare_files(void)
return 0;
}
-int sysctl_max_threads(const struct ctl_table *table, int write,
+static int sysctl_max_threads(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -3507,3 +3238,21 @@ int sysctl_max_threads(const struct ctl_table *table, int write,
return 0;
}
+
+static const struct ctl_table fork_sysctl_table[] = {
+ {
+ .procname = "threads-max",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_max_threads,
+ },
+};
+
+static int __init init_fork_sysctl(void)
+{
+ register_sysctl_init("kernel", fork_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_fork_sysctl);
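
With futex_mm_init() now running before mm_alloc_pgd(), mm_init()'s failure labels have to unwind in reverse order: a pgd allocation failure falls through futex_hash_free(), while a futex init failure frees nothing extra. A small stand-alone sketch of that goto-ladder convention (the allocations are placeholders, not the real resources):

#include <stdio.h>
#include <stdlib.h>

static int demo_mm_init(int fail_step)
{
	void *futex_ref = NULL, *pgd = NULL;

	futex_ref = (fail_step == 1) ? NULL : malloc(16);	/* futex_mm_init() */
	if (!futex_ref)
		goto fail_mm_init;

	pgd = (fail_step == 2) ? NULL : malloc(16);		/* mm_alloc_pgd() */
	if (!pgd)
		goto fail_nopgd;

	printf("init succeeded\n");
	free(pgd);
	free(futex_ref);
	return 0;

fail_nopgd:
	free(futex_ref);		/* futex_hash_free() */
fail_mm_init:
	printf("failed at step %d, earlier steps unwound\n", fail_step);
	return -1;			/* caller then sees NULL, as with free_mm() */
}

int main(void)
{
	demo_mm_init(1);
	demo_mm_init(2);
	demo_mm_init(0);
	return 0;
}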
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 8d530d0949ff..6a96149aede9 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -201,18 +201,9 @@ static int __restore_freezer_state(struct task_struct *p, void *arg)
void __thaw_task(struct task_struct *p)
{
- unsigned long flags;
-
- spin_lock_irqsave(&freezer_lock, flags);
- if (WARN_ON_ONCE(freezing(p)))
- goto unlock;
-
- if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL))
- goto unlock;
-
- wake_up_state(p, TASK_FROZEN);
-unlock:
- spin_unlock_irqrestore(&freezer_lock, flags);
+ guard(spinlock_irqsave)(&freezer_lock);
+ if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL))
+ wake_up_state(p, TASK_FROZEN);
}
/**
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 19a2c65f3d37..d9bb5567af0c 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -42,7 +42,6 @@
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>
-#include <linux/rcuref.h>
#include <linux/mempolicy.h>
#include <linux/mmap_lock.h>
@@ -65,12 +64,11 @@ static struct {
#define futex_queues (__futex_data.queues)
struct futex_private_hash {
- rcuref_t users;
+ int state;
unsigned int hash_mask;
struct rcu_head rcu;
void *mm;
bool custom;
- bool immutable;
struct futex_hash_bucket queues[];
};
@@ -129,6 +127,12 @@ static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph);
#ifdef CONFIG_FUTEX_PRIVATE_HASH
+static bool futex_ref_get(struct futex_private_hash *fph);
+static bool futex_ref_put(struct futex_private_hash *fph);
+static bool futex_ref_is_dead(struct futex_private_hash *fph);
+
+enum { FR_PERCPU = 0, FR_ATOMIC };
+
static inline bool futex_key_is_private(union futex_key *key)
{
/*
@@ -138,19 +142,14 @@ static inline bool futex_key_is_private(union futex_key *key)
return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
}
-bool futex_private_hash_get(struct futex_private_hash *fph)
+static bool futex_private_hash_get(struct futex_private_hash *fph)
{
- if (fph->immutable)
- return true;
- return rcuref_get(&fph->users);
+ return futex_ref_get(fph);
}
void futex_private_hash_put(struct futex_private_hash *fph)
{
- /* Ignore return value, last put is verified via rcuref_is_dead() */
- if (fph->immutable)
- return;
- if (rcuref_put(&fph->users))
+ if (futex_ref_put(fph))
wake_up_var(fph->mm);
}
@@ -243,14 +242,18 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
fph = rcu_dereference_protected(mm->futex_phash,
lockdep_is_held(&mm->futex_hash_lock));
if (fph) {
- if (!rcuref_is_dead(&fph->users)) {
+ if (!futex_ref_is_dead(fph)) {
mm->futex_phash_new = new;
return false;
}
futex_rehash_private(fph, new);
}
- rcu_assign_pointer(mm->futex_phash, new);
+ new->state = FR_PERCPU;
+ scoped_guard(rcu) {
+ mm->futex_batches = get_state_synchronize_rcu();
+ rcu_assign_pointer(mm->futex_phash, new);
+ }
kvfree_rcu(fph, rcu);
return true;
}
@@ -289,9 +292,7 @@ again:
if (!fph)
return NULL;
- if (fph->immutable)
- return fph;
- if (rcuref_get(&fph->users))
+ if (futex_private_hash_get(fph))
return fph;
}
futex_pivot_hash(mm);
@@ -531,7 +532,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
*
* For shared mappings (when @fshared), the key is:
*
- * ( inode->i_sequence, page->index, offset_within_page )
+ * ( inode->i_sequence, page offset within mapping, offset_within_page )
*
* [ also see get_inode_sequence_number() ]
*
@@ -583,8 +584,8 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
if (futex_get_value(&node, naddr))
return -EFAULT;
- if (node != FUTEX_NO_NODE &&
- (node >= MAX_NUMNODES || !node_possible(node)))
+ if ((node != FUTEX_NO_NODE) &&
+ ((unsigned int)node >= MAX_NUMNODES || !node_possible(node)))
return -EINVAL;
}
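
The added cast above folds the negative-node case into a single comparison: any node value below zero, other than FUTEX_NO_NODE which is checked first, becomes a huge unsigned number and fails the MAX_NUMNODES bound. A short stand-alone illustration (the constants are made up):

#include <stdio.h>

#define MAX_NUMNODES_DEMO	64	/* illustrative */
#define FUTEX_NO_NODE_DEMO	(-1)	/* illustrative */

/* Casting to unsigned folds the "node < 0" case into the range check. */
static int node_valid(int node)
{
	if (node == FUTEX_NO_NODE_DEMO)
		return 1;
	return (unsigned int)node < MAX_NUMNODES_DEMO;
}

int main(void)
{
	int samples[] = { -1, -2, 0, 63, 64 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("node=%d valid=%d\n", samples[i], node_valid(samples[i]));
	return 0;
}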
@@ -1524,19 +1525,221 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
}
#define FH_CUSTOM 0x01
-#define FH_IMMUTABLE 0x02
#ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+/*
+ * futex-ref
+ *
+ * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
+ * code because it just doesn't fit right.
+ *
+ * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
+ * re-initializes the state automatically, such that the fph swizzle is also a
+ * transition back to per-cpu.
+ */
+
+static void futex_ref_rcu(struct rcu_head *head);
+
+static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * The counter we're about to switch to must have fully switched;
+ * otherwise it would be impossible for it to have reported success
+ * from futex_ref_is_dead().
+ */
+ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+
+ /*
+ * Set the atomic to the bias value such that futex_ref_{get,put}()
+ * will never observe 0. Will be fixed up in __futex_ref_atomic_end()
+ * when folding in the percpu count.
+ */
+ atomic_long_set(&mm->futex_atomic, LONG_MAX);
+ smp_store_release(&fph->state, FR_ATOMIC);
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static void __futex_ref_atomic_end(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+ unsigned int count = 0;
+ long ret;
+ int cpu;
+
+ /*
+ * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
+ * and per this RCU callback, everybody must now observe this state and
+ * use the atomic variable.
+ */
+ WARN_ON_ONCE(fph->state != FR_ATOMIC);
+
+ /*
+ * Therefore the per-cpu counter is now stable, sum and reset.
+ */
+ for_each_possible_cpu(cpu) {
+ unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+ count += *ptr;
+ *ptr = 0;
+ }
+
+ /*
+ * Re-init for the next cycle.
+ */
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+
+ /*
+ * Add actual count, subtract bias and initial refcount.
+ *
+ * The moment this atomic operation happens, futex_ref_is_dead() can
+ * become true.
+ */
+ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+ if (!ret)
+ wake_up_var(mm);
+
+ WARN_ON_ONCE(ret < 0);
+ mmput_async(mm);
+}
+
+static void futex_ref_rcu(struct rcu_head *head)
+{
+ struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
+ struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+
+ if (fph->state == FR_PERCPU) {
+ /*
+ * Per this extra grace-period, everybody must now observe
+ * fph as the current fph and no previously observed fph's
+ * are in-flight.
+ *
+ * Notably, nobody will now rely on the atomic
+ * futex_ref_is_dead() state anymore so we can begin the
+ * migration of the per-cpu counter into the atomic.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ __futex_ref_atomic_end(fph);
+}
+
+/*
+ * Drop the initial refcount and transition to atomics.
+ */
+static void futex_ref_drop(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * Can only transition the current fph;
+ */
+ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+ /*
+ * We enqueue at least one RCU callback. Ensure mm stays if the task
+ * exits before the transition is completed.
+ */
+ mmget(mm);
+
+ /*
+ * In order to avoid the following scenario:
+ *
+ * futex_hash() __futex_pivot_hash()
+ * guard(rcu); guard(mm->futex_hash_lock);
+ * fph = mm->futex_phash;
+ * rcu_assign_pointer(&mm->futex_phash, new);
+ * futex_hash_allocate()
+ * futex_ref_drop()
+ * fph->state = FR_ATOMIC;
+ * atomic_set(, BIAS);
+ *
+ * futex_private_hash_get(fph); // OOPS
+ *
+ * Where an old fph (which is FR_ATOMIC and should fail on
+ * inc_not_zero) will succeed because a new transition is started and
+ * the atomic is biased away from 0.
+ *
+ * There must be at least one full grace-period between publishing a
+ * new fph and trying to replace it.
+ */
+ if (poll_state_synchronize_rcu(mm->futex_batches)) {
+ /*
+ * There was a grace-period, we can begin now.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static bool futex_ref_get(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+ this_cpu_inc(*mm->futex_ref);
+ return true;
+ }
+
+ return atomic_long_inc_not_zero(&mm->futex_atomic);
+}
+
+static bool futex_ref_put(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+ this_cpu_dec(*mm->futex_ref);
+ return false;
+ }
+
+ return atomic_long_dec_and_test(&mm->futex_atomic);
+}
+
+static bool futex_ref_is_dead(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU)
+ return false;
+
+ return atomic_long_read(&mm->futex_atomic) == 0;
+}
+
+int futex_mm_init(struct mm_struct *mm)
+{
+ mutex_init(&mm->futex_hash_lock);
+ RCU_INIT_POINTER(mm->futex_phash, NULL);
+ mm->futex_phash_new = NULL;
+ /* futex-ref */
+ atomic_long_set(&mm->futex_atomic, 0);
+ mm->futex_batches = get_state_synchronize_rcu();
+ mm->futex_ref = alloc_percpu(unsigned int);
+ if (!mm->futex_ref)
+ return -ENOMEM;
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ return 0;
+}
+
void futex_hash_free(struct mm_struct *mm)
{
struct futex_private_hash *fph;
+ free_percpu(mm->futex_ref);
kvfree(mm->futex_phash_new);
fph = rcu_dereference_raw(mm->futex_phash);
- if (fph) {
- WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+ if (fph)
kvfree(fph);
- }
}
static bool futex_pivot_pending(struct mm_struct *mm)
@@ -1549,7 +1752,7 @@ static bool futex_pivot_pending(struct mm_struct *mm)
return true;
fph = rcu_dereference(mm->futex_phash);
- return rcuref_is_dead(&fph->users);
+ return futex_ref_is_dead(fph);
}
static bool futex_hash_less(struct futex_private_hash *a,
@@ -1591,21 +1794,20 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
*/
scoped_guard(rcu) {
fph = rcu_dereference(mm->futex_phash);
- if (fph && (!fph->hash_mask || fph->immutable)) {
+ if (fph && !fph->hash_mask) {
if (custom)
return -EBUSY;
return 0;
}
}
- fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ fph = kvzalloc(struct_size(fph, queues, hash_slots),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!fph)
return -ENOMEM;
- rcuref_init(&fph->users, 1);
fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
fph->custom = custom;
- fph->immutable = !!(flags & FH_IMMUTABLE);
fph->mm = mm;
for (i = 0; i < hash_slots; i++)
@@ -1629,13 +1831,23 @@ again:
mm->futex_phash_new = NULL;
if (fph) {
+ if (cur && !cur->hash_mask) {
+ /*
+ * If two threads simultaneously request the global
+ * hash then the first one performs the switch,
+ * the second one returns here.
+ */
+ free = fph;
+ mm->futex_phash_new = new;
+ return -EBUSY;
+ }
if (cur && !new) {
/*
* If we have an existing hash, but do not yet have
* allocated a replacement hash, drop the initial
* reference on the existing hash.
*/
- futex_private_hash_put(cur);
+ futex_ref_drop(cur);
}
if (new) {
@@ -1712,19 +1924,6 @@ static int futex_hash_get_slots(void)
return 0;
}
-static int futex_hash_get_immutable(void)
-{
- struct futex_private_hash *fph;
-
- guard(rcu)();
- fph = rcu_dereference(current->mm->futex_phash);
- if (fph && fph->immutable)
- return 1;
- if (fph && !fph->hash_mask)
- return 1;
- return 0;
-}
-
#else
static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
@@ -1737,10 +1936,6 @@ static int futex_hash_get_slots(void)
return 0;
}
-static int futex_hash_get_immutable(void)
-{
- return 0;
-}
#endif
int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
@@ -1750,10 +1945,8 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
switch (arg2) {
case PR_FUTEX_HASH_SET_SLOTS:
- if (arg4 & ~FH_FLAG_IMMUTABLE)
+ if (arg4)
return -EINVAL;
- if (arg4 & FH_FLAG_IMMUTABLE)
- flags |= FH_IMMUTABLE;
ret = futex_hash_allocate(arg3, flags);
break;
@@ -1761,10 +1954,6 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
ret = futex_hash_get_slots();
break;
- case PR_FUTEX_HASH_GET_IMMUTABLE:
- ret = futex_hash_get_immutable();
- break;
-
default:
ret = -EINVAL;
break;
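
The numbers behind the futex-ref transition above: __futex_ref_atomic_begin() parks the atomic at LONG_MAX so concurrent gets and puts can never observe zero mid-switch, and __futex_ref_atomic_end() folds in the per-cpu sum while subtracting the bias and the initial reference in a single atomic_long_add_return(). A worked example with made-up counts:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	long atomic = LONG_MAX;		/* __futex_ref_atomic_begin() */
	long percpu_sum = 3;		/* initial ref + two live users */

	atomic -= 1;			/* one user did futex_ref_put() meanwhile */

	/* __futex_ref_atomic_end(): fold in percpu count, drop bias + initial ref */
	atomic += percpu_sum - LONG_MAX - 1;

	printf("outstanding references: %ld\n", atomic);	/* prints 1 */
	return 0;
}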
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index fcd1617212ee..c74eac572acd 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -228,14 +228,12 @@ extern void futex_hash_get(struct futex_hash_bucket *hb);
extern void futex_hash_put(struct futex_hash_bucket *hb);
extern struct futex_private_hash *futex_private_hash(void);
-extern bool futex_private_hash_get(struct futex_private_hash *fph);
extern void futex_private_hash_put(struct futex_private_hash *fph);
#else /* !CONFIG_FUTEX_PRIVATE_HASH */
static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
-static inline bool futex_private_hash_get(void) { return false; }
static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
#endif
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index fd75b4a484d7..a08cc076f332 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -22,10 +22,6 @@
#define GCOV_COUNTERS 9
#elif (__GNUC__ >= 10)
#define GCOV_COUNTERS 8
-#elif (__GNUC__ >= 7)
-#define GCOV_COUNTERS 9
-#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
-#define GCOV_COUNTERS 10
#else
#define GCOV_COUNTERS 9
#endif
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index dc898ec93463..d2432df2b905 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -22,6 +22,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
+#include <linux/hung_task.h>
#include <trace/events/sched.h>
@@ -98,30 +99,62 @@ static struct notifier_block panic_block = {
static void debug_show_blocker(struct task_struct *task)
{
struct task_struct *g, *t;
- unsigned long owner;
- struct mutex *lock;
+ unsigned long owner, blocker, blocker_type;
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
- lock = READ_ONCE(task->blocker_mutex);
- if (!lock)
+ blocker = READ_ONCE(task->blocker);
+ if (!blocker)
return;
- owner = mutex_get_owner(lock);
+ blocker_type = hung_task_get_blocker_type(blocker);
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ owner = mutex_get_owner(
+ (struct mutex *)hung_task_blocker_to_lock(blocker));
+ break;
+ case BLOCKER_TYPE_SEM:
+ owner = sem_last_holder(
+ (struct semaphore *)hung_task_blocker_to_lock(blocker));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+
if (unlikely(!owner)) {
- pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
- task->comm, task->pid);
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
+ pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
+ task->comm, task->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
+ task->comm, task->pid);
+ break;
+ }
return;
}
/* Ensure the owner information is correct. */
for_each_process_thread(g, t) {
- if ((unsigned long)t == owner) {
+ if ((unsigned long)t != owner)
+ continue;
+
+ switch (blocker_type) {
+ case BLOCKER_TYPE_MUTEX:
pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
- task->comm, task->pid, t->comm, t->pid);
- sched_show_task(t);
- return;
+ task->comm, task->pid, t->comm, t->pid);
+ break;
+ case BLOCKER_TYPE_SEM:
+ pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
+ task->comm, task->pid, t->comm, t->pid);
+ break;
}
+ sched_show_task(t);
+ return;
}
}
#else
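
The decode helpers used above come from <linux/hung_task.h>. As a rough sketch of the idea (hypothetical names and packing, assuming lock pointers leave their low bits free for a type tag):

	#define EX_BLOCKER_TYPE_MASK	3UL	/* hypothetical: low two pointer bits */

	static inline unsigned long ex_encode_blocker(void *lock, unsigned long type)
	{
		return (unsigned long)lock | (type & EX_BLOCKER_TYPE_MASK);
	}

	static inline unsigned long ex_get_blocker_type(unsigned long blocker)
	{
		return blocker & EX_BLOCKER_TYPE_MASK;	/* BLOCKER_TYPE_MUTEX / _SEM */
	}

	static inline void *ex_blocker_to_lock(unsigned long blocker)
	{
		return (void *)(blocker & ~EX_BLOCKER_TYPE_MASK);
	}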
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 3f02a0e45254..1da5e9d9da71 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -144,6 +144,17 @@ config GENERIC_IRQ_DEBUGFS
config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
bool
+config IRQ_KUNIT_TEST
+ bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y
+ default KUNIT_ALL_TESTS
+ imply SMP
+ help
+ This option enables KUnit tests for the IRQ subsystem API. These are
+ only for development and testing, not for regular kernel use cases.
+
+ If unsure, say N.
+
endmenu
config GENERIC_IRQ_MULTI_HANDLER
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index c0f44c06d69d..6ab3a4055667 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
+obj-$(CONFIG_IRQ_KUNIT_TEST) += irq_test.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 44a4eba80315..4013e6ad2b2f 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -69,21 +69,20 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
* have multiple sets, build each sets affinity mask separately.
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
- unsigned int this_vecs = affd->set_size[i];
- int j;
- struct cpumask *result = group_cpus_evenly(this_vecs);
+ unsigned int nr_masks, this_vecs = affd->set_size[i];
+ struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks);
if (!result) {
kfree(masks);
return NULL;
}
- for (j = 0; j < this_vecs; j++)
+ for (int j = 0; j < nr_masks; j++)
cpumask_copy(&masks[curvec + j].mask, &result[j]);
kfree(result);
- curvec += this_vecs;
- usedvecs += this_vecs;
+ curvec += nr_masks;
+ usedvecs += nr_masks;
}
/* Fill out vectors at the end that don't need affinity */
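
The hunk above depends on group_cpus_evenly() reporting, through its new second argument, how many masks it actually produced, which may be fewer than the requested vector count. A minimal caller sketch mirroring that pattern (illustrative only):

	static int ex_copy_grouped_masks(unsigned int nvecs, struct cpumask *out)
	{
		unsigned int nr_masks;
		struct cpumask *masks = group_cpus_evenly(nvecs, &nr_masks);

		if (!masks)
			return -ENOMEM;
		/* Only nr_masks entries are valid; do not assume nvecs of them. */
		for (unsigned int i = 0; i < nr_masks; i++)
			cpumask_copy(&out[i], &masks[i]);
		kfree(masks);
		return nr_masks;
	}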
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index ae60cae24e9a..d0af8a8b3ae6 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -43,18 +43,16 @@ unsigned long probe_irq_on(void)
* flush such a longstanding irq before considering it as spurious.
*/
for_each_irq_desc_reverse(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
/*
* Some chips need to know about probing in
* progress:
*/
if (desc->irq_data.chip->irq_set_type)
- desc->irq_data.chip->irq_set_type(&desc->irq_data,
- IRQ_TYPE_PROBE);
+ desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE);
irq_activate_and_startup(desc, IRQ_NORESEND);
}
- raw_spin_unlock_irq(&desc->lock);
}
/* Wait for longstanding interrupts to trigger. */
@@ -66,13 +64,12 @@ unsigned long probe_irq_on(void)
* happened in the previous stage, it may have masked itself)
*/
for_each_irq_desc_reverse(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
if (irq_activate_and_startup(desc, IRQ_NORESEND))
desc->istate |= IRQS_PENDING;
}
- raw_spin_unlock_irq(&desc->lock);
}
/*
@@ -84,18 +81,16 @@ unsigned long probe_irq_on(void)
* Now filter out any obviously spurious interrupts
*/
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
-
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
/* It triggered already - consider it spurious. */
if (!(desc->istate & IRQS_WAITING)) {
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
- } else
- if (i < 32)
- mask |= 1 << i;
+ } else if (i < 32) {
+ mask |= 1 << i;
+ }
}
- raw_spin_unlock_irq(&desc->lock);
}
return mask;
@@ -121,7 +116,7 @@ unsigned int probe_irq_mask(unsigned long val)
int i;
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (i < 16 && !(desc->istate & IRQS_WAITING))
mask |= 1 << i;
@@ -129,7 +124,6 @@ unsigned int probe_irq_mask(unsigned long val)
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
}
- raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
@@ -160,8 +154,7 @@ int probe_irq_off(unsigned long val)
struct irq_desc *desc;
for_each_irq_desc(i, desc) {
- raw_spin_lock_irq(&desc->lock);
-
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->istate & IRQS_AUTODETECT) {
if (!(desc->istate & IRQS_WAITING)) {
if (!nr_of_irqs)
@@ -171,7 +164,6 @@ int probe_irq_off(unsigned long val)
desc->istate &= ~IRQS_AUTODETECT;
irq_shutdown_and_deactivate(desc);
}
- raw_spin_unlock_irq(&desc->lock);
}
mutex_unlock(&probing_active);
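
These conversions (and the ones that follow) replace open-coded lock/unlock pairs with the scope-based guards from <linux/cleanup.h>; the lock is dropped automatically when the guarded scope is left, including via early return. A minimal sketch of the two forms used in this series:

	static DEFINE_RAW_SPINLOCK(ex_lock);
	static unsigned int ex_count;

	static void ex_function_scope(void)
	{
		guard(raw_spinlock_irq)(&ex_lock);	/* released on any return path */
		ex_count++;
	}

	static void ex_block_scope(void)
	{
		scoped_guard(raw_spinlock_irq, &ex_lock)
			ex_count++;		/* released at the end of this statement */
	}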
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 36cf1b09cc84..624106e886ad 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,98 +34,80 @@ struct irqaction chained_action = {
};
/**
- * irq_set_chip - set the irq chip for an irq
- * @irq: irq number
- * @chip: pointer to irq chip description structure
+ * irq_set_chip - set the irq chip for an irq
+ * @irq: irq number
+ * @chip: pointer to irq chip description structure
*/
int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ int ret = -EINVAL;
- if (!desc)
- return -EINVAL;
-
- desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
- irq_put_desc_unlock(desc, flags);
- /*
- * For !CONFIG_SPARSE_IRQ make the irq show up in
- * allocated_irqs.
- */
- irq_mark_irq(irq);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
+ ret = 0;
+ }
+ /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */
+ if (!ret)
+ irq_mark_irq(irq);
+ return ret;
}
EXPORT_SYMBOL(irq_set_chip);
/**
- * irq_set_irq_type - set the irq trigger type for an irq
- * @irq: irq number
- * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ * irq_set_irq_type - set the irq trigger type for an irq
+ * @irq: irq number
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
int irq_set_irq_type(unsigned int irq, unsigned int type)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- int ret = 0;
-
- if (!desc)
- return -EINVAL;
-
- ret = __irq_set_trigger(desc, type);
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL)
+ return __irq_set_trigger(scoped_irqdesc, type);
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_irq_type);
/**
- * irq_set_handler_data - set irq handler data for an irq
- * @irq: Interrupt number
- * @data: Pointer to interrupt specific data
+ * irq_set_handler_data - set irq handler data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to interrupt specific data
*
- * Set the hardware irq controller data for an irq
+ * Set the hardware irq controller data for an irq
*/
int irq_set_handler_data(unsigned int irq, void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
- desc->irq_common_data.handler_data = data;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_common_data.handler_data = data;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_handler_data);
/**
- * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
- * @irq_base: Interrupt number base
- * @irq_offset: Interrupt number offset
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
+ * @irq_base: Interrupt number base
+ * @irq_offset: Interrupt number offset
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq at offset
+ * Set the MSI descriptor entry for an irq at offset
*/
-int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
- struct msi_desc *entry)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return -EINVAL;
- desc->irq_common_data.msi_desc = entry;
- if (entry && !irq_offset)
- entry->irq = irq_base;
- irq_put_desc_unlock(desc, flags);
- return 0;
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry)
+{
+ scoped_irqdesc_get_and_lock(irq_base + irq_offset, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc->irq_common_data.msi_desc = entry;
+ if (entry && !irq_offset)
+ entry->irq = irq_base;
+ return 0;
+ }
+ return -EINVAL;
}
/**
- * irq_set_msi_desc - set MSI descriptor data for an irq
- * @irq: Interrupt number
- * @entry: Pointer to MSI descriptor data
+ * irq_set_msi_desc - set MSI descriptor data for an irq
+ * @irq: Interrupt number
+ * @entry: Pointer to MSI descriptor data
*
- * Set the MSI descriptor entry for an irq
+ * Set the MSI descriptor entry for an irq
*/
int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
{
@@ -133,22 +115,19 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
}
/**
- * irq_set_chip_data - set irq chip data for an irq
- * @irq: Interrupt number
- * @data: Pointer to chip specific data
+ * irq_set_chip_data - set irq chip data for an irq
+ * @irq: Interrupt number
+ * @data: Pointer to chip specific data
*
- * Set the hardware irq chip data for an irq
+ * Set the hardware irq chip data for an irq
*/
int irq_set_chip_data(unsigned int irq, void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
- desc->irq_data.chip_data = data;
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->irq_data.chip_data = data;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_chip_data);
@@ -223,6 +202,27 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
return IRQ_STARTUP_ABORT;
return IRQ_STARTUP_MANAGED;
}
+
+void irq_startup_managed(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+ /*
+ * Clear managed-shutdown flag, so we don't repeat managed-startup for
+ * multiple hotplugs, and cause imbalanced disable depth.
+ */
+ irqd_clr_managed_shutdown(d);
+
+ /*
+ * Only start it up when the disable depth is 1, so that a disable,
+ * hotunplug, hotplug sequence does not end up enabling it during
+ * hotplug unconditionally.
+ */
+ desc->depth--;
+ if (!desc->depth)
+ irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+}
+
#else
static __always_inline int
__irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
@@ -290,6 +290,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
ret = __irq_startup(desc);
break;
case IRQ_STARTUP_ABORT:
+ desc->depth = 1;
irqd_set_managed_shutdown(d);
return 0;
}
@@ -322,7 +323,13 @@ void irq_shutdown(struct irq_desc *desc)
{
if (irqd_is_started(&desc->irq_data)) {
clear_irq_resend(desc);
- desc->depth = 1;
+ /*
+ * Increment disable depth, so that a managed shutdown on
+ * CPU hotunplug preserves the actual disabled state when the
+ * CPU comes back online. See irq_startup_managed().
+ */
+ desc->depth++;
+
if (desc->irq_data.chip->irq_shutdown) {
desc->irq_data.chip->irq_shutdown(&desc->irq_data);
irq_state_set_disabled(desc);
@@ -450,64 +457,33 @@ void unmask_threaded_irq(struct irq_desc *desc)
unmask_irq(desc);
}
-/*
- * handle_nested_irq - Handle a nested irq from a irq thread
- * @irq: the interrupt number
- *
- * Handle interrupts which are nested into a threaded interrupt
- * handler. The handler function is called inside the calling
- * threads context.
- */
-void handle_nested_irq(unsigned int irq)
+/* Busy wait until INPROGRESS is cleared */
+static bool irq_wait_on_inprogress(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
- irqreturn_t action_ret;
-
- might_sleep();
-
- raw_spin_lock_irq(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ do {
+ raw_spin_unlock(&desc->lock);
+ while (irqd_irq_inprogress(&desc->irq_data))
+ cpu_relax();
+ raw_spin_lock(&desc->lock);
+ } while (irqd_irq_inprogress(&desc->irq_data));
- action = desc->action;
- if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- raw_spin_unlock_irq(&desc->lock);
- return;
+ /* Might have been disabled in meantime */
+ return !irqd_irq_disabled(&desc->irq_data) && desc->action;
}
-
- kstat_incr_irqs_this_cpu(desc);
- atomic_inc(&desc->threads_active);
- raw_spin_unlock_irq(&desc->lock);
-
- action_ret = IRQ_NONE;
- for_each_action_of_desc(desc, action)
- action_ret |= action->thread_fn(action->irq, action->dev_id);
-
- if (!irq_settings_no_debug(desc))
- note_interrupt(desc, action_ret);
-
- wake_threads_waitq(desc);
+ return false;
}
-EXPORT_SYMBOL_GPL(handle_nested_irq);
-static bool irq_check_poll(struct irq_desc *desc)
+static bool irq_can_handle_pm(struct irq_desc *desc)
{
- if (!(desc->istate & IRQS_POLL_INPROGRESS))
- return false;
- return irq_wait_for_poll(desc);
-}
-
-static bool irq_may_run(struct irq_desc *desc)
-{
- unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
+ struct irq_data *irqd = &desc->irq_data;
+ const struct cpumask *aff;
/*
* If the interrupt is not in progress and is not an armed
* wakeup interrupt, proceed.
*/
- if (!irqd_has_set(&desc->irq_data, mask))
+ if (!irqd_has_set(irqd, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED))
return true;
/*
@@ -515,86 +491,161 @@ static bool irq_may_run(struct irq_desc *desc)
* and suspended, disable it and notify the pm core about the
* event.
*/
- if (irq_pm_check_wakeup(desc))
+ if (unlikely(irqd_has_set(irqd, IRQD_WAKEUP_ARMED))) {
+ irq_pm_handle_wakeup(desc);
+ return false;
+ }
+
+ /* Check whether the interrupt is polled on another CPU */
+ if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) {
+ if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+ "irq poll in progress on cpu %d for irq %d\n",
+ smp_processor_id(), desc->irq_data.irq))
+ return false;
+ return irq_wait_on_inprogress(desc);
+ }
+
+ /* The below works only for single target interrupts */
+ if (!IS_ENABLED(CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK) ||
+ !irqd_is_single_target(irqd) || desc->handle_irq != handle_edge_irq)
return false;
/*
- * Handle a potential concurrent poll on a different core.
+ * If the interrupt affinity was moved to this CPU and the
+ * interrupt is currently handled on the previous target CPU, then
+ * busy wait for INPROGRESS to be cleared. Otherwise for edge type
+ * interrupts the handler might get stuck on the previous target:
+ *
+ * CPU 0 CPU 1 (new target)
+ * handle_edge_irq()
+ * repeat:
+ * handle_event() handle_edge_irq()
+ * if (INPROGRESS) {
+ * set(PENDING);
+ * mask();
+ * return;
+ * }
+ * if (PENDING) {
+ * clear(PENDING);
+ * unmask();
+ * goto repeat;
+ * }
+ *
+ * This happens when the device raises interrupts with a high rate
+ * and always before handle_event() completes and the CPU0 handler
+ * can clear INPROGRESS. This has been observed in virtual machines.
*/
- return irq_check_poll(desc);
+ aff = irq_data_get_effective_affinity_mask(irqd);
+ if (cpumask_first(aff) != smp_processor_id())
+ return false;
+ return irq_wait_on_inprogress(desc);
+}
+
+static inline bool irq_can_handle_actions(struct irq_desc *desc)
+{
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
+ return false;
+ }
+ return true;
+}
+
+static inline bool irq_can_handle(struct irq_desc *desc)
+{
+ if (!irq_can_handle_pm(desc))
+ return false;
+
+ return irq_can_handle_actions(desc);
}
/**
- * handle_simple_irq - Simple and software-decoded IRQs.
- * @desc: the interrupt description structure for this irq
+ * handle_nested_irq - Handle a nested irq from an irq thread
+ * @irq: the interrupt number
*
- * Simple interrupts are either sent from a demultiplexing interrupt
- * handler or come from hardware, where no interrupt hardware control
- * is necessary.
- *
- * Note: The caller is expected to handle the ack, clear, mask and
- * unmask issues if necessary.
+ * Handle interrupts which are nested into a threaded interrupt
+ * handler. The handler function is called inside the calling threads
+ * context.
*/
-void handle_simple_irq(struct irq_desc *desc)
+void handle_nested_irq(unsigned int irq)
{
- raw_spin_lock(&desc->lock);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ irqreturn_t action_ret;
- if (!irq_may_run(desc))
- goto out_unlock;
+ might_sleep();
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ if (!irq_can_handle_actions(desc))
+ return;
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ action = desc->action;
+ kstat_incr_irqs_this_cpu(desc);
+ atomic_inc(&desc->threads_active);
}
+ action_ret = IRQ_NONE;
+ for_each_action_of_desc(desc, action)
+ action_ret |= action->thread_fn(action->irq, action->dev_id);
+
+ if (!irq_settings_no_debug(desc))
+ note_interrupt(desc, action_ret);
+
+ wake_threads_waitq(desc);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
+
+/**
+ * handle_simple_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
+ *
+ * Simple interrupts are either sent from a demultiplexing interrupt
+ * handler or come from hardware, where no interrupt hardware control is
+ * necessary.
+ *
+ * Note: The caller is expected to handle the ack, clear, mask and unmask
+ * issues if necessary.
+ */
+void handle_simple_irq(struct irq_desc *desc)
+{
+ guard(raw_spinlock)(&desc->lock);
+
+ if (!irq_can_handle(desc))
+ return;
+
kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
/**
- * handle_untracked_irq - Simple and software-decoded IRQs.
- * @desc: the interrupt description structure for this irq
+ * handle_untracked_irq - Simple and software-decoded IRQs.
+ * @desc: the interrupt description structure for this irq
*
- * Untracked interrupts are sent from a demultiplexing interrupt
- * handler when the demultiplexer does not know which device it its
- * multiplexed irq domain generated the interrupt. IRQ's handled
- * through here are not subjected to stats tracking, randomness, or
- * spurious interrupt detection.
+ * Untracked interrupts are sent from a demultiplexing interrupt handler
+ * when the demultiplexer does not know which device in its multiplexed irq
+ * domain generated the interrupt. IRQs handled through here are not
+ * subjected to stats tracking, randomness, or spurious interrupt
+ * detection.
*
- * Note: Like handle_simple_irq, the caller is expected to handle
- * the ack, clear, mask and unmask issues if necessary.
+ * Note: Like handle_simple_irq, the caller is expected to handle the ack,
+ * clear, mask and unmask issues if necessary.
*/
void handle_untracked_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
-
- if (!irq_may_run(desc))
- goto out_unlock;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ scoped_guard(raw_spinlock, &desc->lock) {
+ if (!irq_can_handle(desc))
+ return;
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
}
- desc->istate &= ~IRQS_PENDING;
- irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
- raw_spin_unlock(&desc->lock);
-
__handle_irq_event_percpu(desc);
- raw_spin_lock(&desc->lock);
- irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
+ scoped_guard(raw_spinlock, &desc->lock)
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
}
EXPORT_SYMBOL_GPL(handle_untracked_irq);
@@ -617,40 +668,26 @@ static void cond_unmask_irq(struct irq_desc *desc)
}
/**
- * handle_level_irq - Level type irq handler
- * @desc: the interrupt description structure for this irq
+ * handle_level_irq - Level type irq handler
+ * @desc: the interrupt description structure for this irq
*
- * Level type interrupts are active as long as the hardware line has
- * the active level. This may require to mask the interrupt and unmask
- * it after the associated handler has acknowledged the device, so the
- * interrupt line is back to inactive.
+ * Level type interrupts are active as long as the hardware line has the
+ * active level. This may require to mask the interrupt and unmask it after
+ * the associated handler has acknowledged the device, so the interrupt
+ * line is back to inactive.
*/
void handle_level_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
mask_ack_irq(desc);
- if (!irq_may_run(desc))
- goto out_unlock;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- /*
- * If its disabled or no action available
- * keep it masked and get out of here
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- goto out_unlock;
- }
+ if (!irq_can_handle(desc))
+ return;
kstat_incr_irqs_this_cpu(desc);
handle_irq_event(desc);
cond_unmask_irq(desc);
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);
@@ -675,42 +712,43 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
}
}
+static inline void cond_eoi_irq(struct irq_chip *chip, struct irq_data *data)
+{
+ if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ chip->irq_eoi(data);
+}
+
/**
- * handle_fasteoi_irq - irq handler for transparent controllers
- * @desc: the interrupt description structure for this irq
+ * handle_fasteoi_irq - irq handler for transparent controllers
+ * @desc: the interrupt description structure for this irq
*
- * Only a single callback will be issued to the chip: an ->eoi()
- * call when the interrupt has been serviced. This enables support
- * for modern forms of interrupt handlers, which handle the flow
- * details in hardware, transparently.
+ * Only a single callback will be issued to the chip: an ->eoi() call when
+ * the interrupt has been serviced. This enables support for modern forms
+ * of interrupt handlers, which handle the flow details in hardware,
+ * transparently.
*/
void handle_fasteoi_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
/*
* When an affinity change races with IRQ handling, the next interrupt
* can arrive on the new CPU before the original CPU has completed
* handling the previous one - it may need to be resent.
*/
- if (!irq_may_run(desc)) {
+ if (!irq_can_handle_pm(desc)) {
if (irqd_needs_resend_when_in_progress(&desc->irq_data))
desc->istate |= IRQS_PENDING;
- goto out;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- /*
- * If its disabled or no action available
- * then mask it and get out of here:
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
+ if (!irq_can_handle_actions(desc)) {
mask_irq(desc);
- goto out;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
@@ -726,13 +764,6 @@ void handle_fasteoi_irq(struct irq_desc *desc)
*/
if (unlikely(desc->istate & IRQS_PENDING))
check_irq_resend(desc, false);
-
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
@@ -770,40 +801,27 @@ void handle_fasteoi_nmi(struct irq_desc *desc)
EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
/**
- * handle_edge_irq - edge type IRQ handler
- * @desc: the interrupt description structure for this irq
+ * handle_edge_irq - edge type IRQ handler
+ * @desc: the interrupt description structure for this irq
*
- * Interrupt occurs on the falling and/or rising edge of a hardware
- * signal. The occurrence is latched into the irq controller hardware
- * and must be acked in order to be reenabled. After the ack another
- * interrupt can happen on the same source even before the first one
- * is handled by the associated event handler. If this happens it
- * might be necessary to disable (mask) the interrupt depending on the
- * controller hardware. This requires to reenable the interrupt inside
- * of the loop which handles the interrupts which have arrived while
- * the handler was running. If all pending interrupts are handled, the
- * loop is left.
+ * Interrupt occurs on the falling and/or rising edge of a hardware
+ * signal. The occurrence is latched into the irq controller hardware and
+ * must be acked in order to be reenabled. After the ack another interrupt
+ * can happen on the same source even before the first one is handled by
+ * the associated event handler. If this happens it might be necessary to
+ * disable (mask) the interrupt depending on the controller hardware. This
+ * requires to reenable the interrupt inside of the loop which handles the
+ * interrupts which have arrived while the handler was running. If all
+ * pending interrupts are handled, the loop is left.
*/
void handle_edge_irq(struct irq_desc *desc)
{
- raw_spin_lock(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ guard(raw_spinlock)(&desc->lock);
- if (!irq_may_run(desc)) {
- desc->istate |= IRQS_PENDING;
- mask_ack_irq(desc);
- goto out_unlock;
- }
-
- /*
- * If its disabled or no action available then mask it and get
- * out of here.
- */
- if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ if (!irq_can_handle(desc)) {
desc->istate |= IRQS_PENDING;
mask_ack_irq(desc);
- goto out_unlock;
+ return;
}
kstat_incr_irqs_this_cpu(desc);
@@ -814,7 +832,7 @@ void handle_edge_irq(struct irq_desc *desc)
do {
if (unlikely(!desc->action)) {
mask_irq(desc);
- goto out_unlock;
+ return;
}
/*
@@ -830,11 +848,7 @@ void handle_edge_irq(struct irq_desc *desc)
handle_irq_event(desc);
- } while ((desc->istate & IRQS_PENDING) &&
- !irqd_irq_disabled(&desc->irq_data));
-
-out_unlock:
- raw_spin_unlock(&desc->lock);
+ } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data));
}
EXPORT_SYMBOL(handle_edge_irq);
@@ -1007,35 +1021,23 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
}
}
-void
-__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
- const char *name)
+void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+ const char *name)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
-
- if (!desc)
- return;
-
- __irq_do_set_handler(desc, handle, is_chained, name);
- irq_put_desc_busunlock(desc, flags);
+ scoped_irqdesc_get_and_lock(irq, 0)
+ __irq_do_set_handler(scoped_irqdesc, handle, is_chained, name);
}
EXPORT_SYMBOL_GPL(__irq_set_handler);
-void
-irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
- void *data)
+void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
+ void *data)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return;
-
- desc->irq_common_data.handler_data = data;
- __irq_do_set_handler(desc, handle, 1, NULL);
-
- irq_put_desc_busunlock(desc, flags);
+ desc->irq_common_data.handler_data = data;
+ __irq_do_set_handler(desc, handle, 1, NULL);
+ }
}
EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
@@ -1050,38 +1052,34 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags, trigger, tmp;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return;
-
- /*
- * Warn when a driver sets the no autoenable flag on an already
- * active interrupt.
- */
- WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
-
- irq_settings_clr_and_set(desc, clr, set);
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ unsigned long trigger, tmp;
+ /*
+ * Warn when a driver sets the no autoenable flag on an already
+ * active interrupt.
+ */
+ WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
- trigger = irqd_get_trigger_type(&desc->irq_data);
+ irq_settings_clr_and_set(desc, clr, set);
- irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
- IRQD_TRIGGER_MASK | IRQD_LEVEL);
- if (irq_settings_has_no_balance_set(desc))
- irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
- if (irq_settings_is_per_cpu(desc))
- irqd_set(&desc->irq_data, IRQD_PER_CPU);
- if (irq_settings_is_level(desc))
- irqd_set(&desc->irq_data, IRQD_LEVEL);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
- tmp = irq_settings_get_trigger_mask(desc);
- if (tmp != IRQ_TYPE_NONE)
- trigger = tmp;
+ irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
+ IRQD_TRIGGER_MASK | IRQD_LEVEL);
+ if (irq_settings_has_no_balance_set(desc))
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ if (irq_settings_is_per_cpu(desc))
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ if (irq_settings_is_level(desc))
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
- irqd_set(&desc->irq_data, trigger);
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
- irq_put_desc_unlock(desc, flags);
+ irqd_set(&desc->irq_data, trigger);
+ }
}
EXPORT_SYMBOL_GPL(irq_modify_status);
@@ -1094,25 +1092,21 @@ EXPORT_SYMBOL_GPL(irq_modify_status);
*/
void irq_cpu_online(void)
{
- struct irq_desc *desc;
- struct irq_chip *chip;
- unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
- desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip;
+
if (!desc)
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_online &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_online(&desc->irq_data);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -1124,25 +1118,21 @@ void irq_cpu_online(void)
*/
void irq_cpu_offline(void)
{
- struct irq_desc *desc;
- struct irq_chip *chip;
- unsigned long flags;
unsigned int irq;
for_each_active_irq(irq) {
- desc = irq_to_desc(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_chip *chip;
+
if (!desc)
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
chip = irq_data_get_irq_chip(&desc->irq_data);
if (chip && chip->irq_cpu_offline &&
(!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
!irqd_irq_disabled(&desc->irq_data)))
chip->irq_cpu_offline(&desc->irq_data);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
#endif
@@ -1151,102 +1141,69 @@ void irq_cpu_offline(void)
#ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS
/**
- * handle_fasteoi_ack_irq - irq handler for edge hierarchy
- * stacked on transparent controllers
+ * handle_fasteoi_ack_irq - irq handler for edge hierarchy stacked on
+ * transparent controllers
*
- * @desc: the interrupt description structure for this irq
+ * @desc: the interrupt description structure for this irq
*
- * Like handle_fasteoi_irq(), but for use with hierarchy where
- * the irq_chip also needs to have its ->irq_ack() function
- * called.
+ * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip
+ * also needs to have its ->irq_ack() function called.
*/
void handle_fasteoi_ack_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
- if (!irq_may_run(desc))
- goto out;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ if (!irq_can_handle_pm(desc)) {
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
+ }
- /*
- * If its disabled or no action available
- * then mask it and get out of here:
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
+ if (unlikely(!irq_can_handle_actions(desc))) {
mask_irq(desc);
- goto out;
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
if (desc->istate & IRQS_ONESHOT)
mask_irq(desc);
- /* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
-
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq);
/**
- * handle_fasteoi_mask_irq - irq handler for level hierarchy
- * stacked on transparent controllers
+ * handle_fasteoi_mask_irq - irq handler for level hierarchy stacked on
+ * transparent controllers
*
- * @desc: the interrupt description structure for this irq
+ * @desc: the interrupt description structure for this irq
*
- * Like handle_fasteoi_irq(), but for use with hierarchy where
- * the irq_chip also needs to have its ->irq_mask_ack() function
- * called.
+ * Like handle_fasteoi_irq(), but for use with hierarchy where the irq_chip
+ * also needs to have its ->irq_mask_ack() function called.
*/
void handle_fasteoi_mask_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
mask_ack_irq(desc);
- if (!irq_may_run(desc))
- goto out;
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- /*
- * If its disabled or no action available
- * then mask it and get out of here:
- */
- if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
- desc->istate |= IRQS_PENDING;
- mask_irq(desc);
- goto out;
+ if (!irq_can_handle(desc)) {
+ cond_eoi_irq(chip, &desc->irq_data);
+ return;
}
kstat_incr_irqs_this_cpu(desc);
- if (desc->istate & IRQS_ONESHOT)
- mask_irq(desc);
handle_irq_event(desc);
cond_unmask_eoi_irq(desc, chip);
-
- raw_spin_unlock(&desc->lock);
- return;
-out:
- if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED))
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq);
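
Taken together, the depth changes in irq_shutdown(), irq_startup_managed() and the IRQ_STARTUP_ABORT path preserve a driver's disable across a CPU hotunplug/hotplug cycle. An illustrative sequence for a managed, single-target interrupt (not part of the patch):

	/*
	 *   disable_irq(irq);            // depth 0 -> 1
	 *   // target CPU goes offline:
	 *   irq_shutdown(desc);          // depth 1 -> 2, IRQD_MANAGED_SHUTDOWN set
	 *   // target CPU comes back online:
	 *   irq_startup_managed(desc);   // depth 2 -> 1, still disabled (depth != 0)
	 *   enable_irq(irq);             // depth 1 -> 0, interrupt is started again
	 *
	 * The previous code forced depth to 1 in irq_shutdown(), so the hotplug
	 * cycle could re-enable an interrupt the driver had deliberately disabled.
	 */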
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 15a7654eff68..755346ea9819 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -177,9 +177,8 @@ void irq_migrate_all_off_this_cpu(void)
bool affinity_broken;
desc = irq_to_desc(irq);
- raw_spin_lock(&desc->lock);
- affinity_broken = migrate_one_irq(desc);
- raw_spin_unlock(&desc->lock);
+ scoped_guard(raw_spinlock, &desc->lock)
+ affinity_broken = migrate_one_irq(desc);
if (affinity_broken) {
pr_debug_ratelimited("IRQ %u: no longer affine to CPU%u\n",
@@ -211,15 +210,8 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
!irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
return;
- /*
- * Don't restore suspended interrupts here when a system comes back
- * from S3. They are reenabled via resume_device_irqs().
- */
- if (desc->istate & IRQS_SUSPENDED)
- return;
-
if (irqd_is_managed_and_shutdown(data))
- irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+ irq_startup_managed(desc);
/*
* If the interrupt can only be directed to a single target
@@ -244,9 +236,8 @@ int irq_affinity_online_cpu(unsigned int cpu)
irq_lock_sparse();
for_each_active_irq(irq) {
desc = irq_to_desc(irq);
- raw_spin_lock_irq(&desc->lock);
- irq_restore_affinity_of_irq(desc, cpu);
- raw_spin_unlock_irq(&desc->lock);
+ scoped_guard(raw_spinlock_irq, &desc->lock)
+ irq_restore_affinity_of_irq(desc, cpu);
}
irq_unlock_sparse();
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index ca142b9a4db3..3527defd2890 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -160,7 +160,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
struct irq_desc *desc = m->private;
struct irq_data *data;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
data = irq_desc_get_irq_data(desc);
seq_printf(m, "handler: %ps\n", desc->handle_irq);
seq_printf(m, "device: %s\n", desc->dev_name);
@@ -178,7 +178,6 @@ static int irq_debug_show(struct seq_file *m, void *p)
seq_printf(m, "node: %d\n", irq_data_get_node(data));
irq_debug_show_masks(m, desc);
irq_debug_show_data(m, data, 0);
- raw_spin_unlock_irq(&desc->lock);
return 0;
}
@@ -226,12 +225,12 @@ void irq_debugfs_copy_devname(int irq, struct device *dev)
void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
{
- char name [10];
+ char name [12];
if (!irq_dir || !desc || desc->debugfs_file)
return;
- sprintf(name, "%d", irq);
+ sprintf(name, "%u", irq);
desc->debugfs_file = debugfs_create_file(name, 0644, irq_dir, desc,
&dfs_irq_ops);
}
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c4a8bca5f2b0..bf59e37d650a 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -40,10 +40,9 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg);
@@ -60,10 +59,9 @@ void irq_gc_mask_set_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
*ct->mask_cache |= mask;
irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -80,10 +78,9 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
*ct->mask_cache &= ~mask;
irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -100,10 +97,9 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.enable);
*ct->mask_cache |= mask;
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg);
@@ -117,9 +113,8 @@ void irq_gc_ack_set_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -133,9 +128,8 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = ~d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
/**
@@ -156,11 +150,10 @@ void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
irq_reg_writel(gc, mask, ct->regs.ack);
- irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set);
@@ -174,9 +167,8 @@ void irq_gc_eoi(struct irq_data *d)
struct irq_chip_type *ct = irq_data_get_chip_type(d);
u32 mask = d->mask;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
irq_reg_writel(gc, mask, ct->regs.eoi);
- irq_gc_unlock(gc);
}
/**
@@ -196,12 +188,11 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
if (!(mask & gc->wake_enabled))
return -EINVAL;
- irq_gc_lock(gc);
+ guard(raw_spinlock)(&gc->lock);
if (on)
gc->wake_active |= mask;
else
gc->wake_active &= ~mask;
- irq_gc_unlock(gc);
return 0;
}
EXPORT_SYMBOL_GPL(irq_gc_set_wake);
@@ -288,7 +279,6 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d,
{
struct irq_domain_chip_generic *dgc;
struct irq_chip_generic *gc;
- unsigned long flags;
int numchips, i;
size_t dgc_sz;
size_t gc_sz;
@@ -340,9 +330,8 @@ int irq_domain_alloc_generic_chips(struct irq_domain *d,
goto err;
}
- raw_spin_lock_irqsave(&gc_lock, flags);
- list_add_tail(&gc->list, &gc_list);
- raw_spin_unlock_irqrestore(&gc_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &gc_lock)
+ list_add_tail(&gc->list, &gc_list);
/* Calc pointer to the next generic chip */
tmp += gc_sz;
}
@@ -459,7 +448,6 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
struct irq_chip_generic *gc;
struct irq_chip_type *ct;
struct irq_chip *chip;
- unsigned long flags;
int idx;
gc = __irq_get_domain_generic_chip(d, hw_irq);
@@ -479,9 +467,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
/* We only init the cache for the first mapping of a generic chip */
if (!gc->installed) {
- raw_spin_lock_irqsave(&gc->lock, flags);
+ guard(raw_spinlock_irqsave)(&gc->lock);
irq_gc_init_mask_cache(gc, dgc->gc_flags);
- raw_spin_unlock_irqrestore(&gc->lock, flags);
}
/* Mark the interrupt as installed */
@@ -548,9 +535,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
struct irq_chip *chip = &ct->chip;
unsigned int i;
- raw_spin_lock(&gc_lock);
- list_add_tail(&gc->list, &gc_list);
- raw_spin_unlock(&gc_lock);
+ scoped_guard (raw_spinlock, &gc_lock)
+ list_add_tail(&gc->list, &gc_list);
irq_gc_init_mask_cache(gc, flags);
@@ -616,9 +602,8 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
{
unsigned int i, virq;
- raw_spin_lock(&gc_lock);
- list_del(&gc->list);
- raw_spin_unlock(&gc_lock);
+ scoped_guard (raw_spinlock, &gc_lock)
+ list_del(&gc->list);
for (i = 0; msk; msk >>= 1, i++) {
if (!(msk & 0x01))
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b0290849c395..0164ca48da59 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -20,6 +20,7 @@
#define istate core_internal_state__do_not_mess_with_it
extern bool noirqdebug;
+extern int irq_poll_cpu;
extern struct irqaction chained_action;
@@ -87,6 +88,7 @@ extern void __enable_irq(struct irq_desc *desc);
extern int irq_activate(struct irq_desc *desc);
extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
+extern void irq_startup_managed(struct irq_desc *desc);
extern void irq_shutdown(struct irq_desc *desc);
extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
@@ -111,7 +113,6 @@ irqreturn_t handle_irq_event(struct irq_desc *desc);
int check_irq_resend(struct irq_desc *desc, bool inject);
void clear_irq_resend(struct irq_desc *desc);
void irq_resend_init(struct irq_desc *desc);
-bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
void wake_threads_waitq(struct irq_desc *desc);
@@ -141,6 +142,10 @@ extern int irq_setup_affinity(struct irq_desc *desc);
static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; }
#endif
+
+#define for_each_action_of_desc(desc, act) \
+ for (act = desc->action; act; act = act->next)
+
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
@@ -160,38 +165,33 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
-#define for_each_action_of_desc(desc, act) \
- for (act = desc->action; act; act = act->next)
-
-struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
- unsigned int check);
+struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check);
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
-static inline struct irq_desc *
-irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
-{
- return __irq_get_desc_lock(irq, flags, true, check);
-}
+__DEFINE_CLASS_IS_CONDITIONAL(irqdesc_lock, true);
+__DEFINE_UNLOCK_GUARD(irqdesc_lock, struct irq_desc,
+ __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus),
+ unsigned long flags; bool bus);
-static inline void
-irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
+static inline class_irqdesc_lock_t class_irqdesc_lock_constructor(unsigned int irq, bool bus,
+ unsigned int check)
{
- __irq_put_desc_unlock(desc, flags, true);
-}
+ class_irqdesc_lock_t _t = { .bus = bus, };
-static inline struct irq_desc *
-irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
-{
- return __irq_get_desc_lock(irq, flags, false, check);
-}
+ _t.lock = __irq_get_desc_lock(irq, &_t.flags, bus, check);
-static inline void
-irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
-{
- __irq_put_desc_unlock(desc, flags, false);
+ return _t;
}
+#define scoped_irqdesc_get_and_lock(_irq, _check) \
+ scoped_guard(irqdesc_lock, _irq, false, _check)
+
+#define scoped_irqdesc_get_and_buslock(_irq, _check) \
+ scoped_guard(irqdesc_lock, _irq, true, _check)
+
+#define scoped_irqdesc ((struct irq_desc *)(__guard_ptr(irqdesc_lock)(&scope)))
+
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
static inline unsigned int irqd_get(struct irq_data *d)
@@ -277,11 +277,11 @@ static inline bool irq_is_nmi(struct irq_desc *desc)
}
#ifdef CONFIG_PM_SLEEP
-bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_handle_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
#else
-static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void irq_pm_handle_wakeup(struct irq_desc *desc) { }
static inline void
irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
static inline void
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 1a3d483548e2..ae4c9cbd1b4b 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -202,7 +202,7 @@ struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode,
void *data)
{
struct irq_sim_work_ctx *work_ctx __free(kfree) =
- kmalloc(sizeof(*work_ctx), GFP_KERNEL);
+ kzalloc(sizeof(*work_ctx), GFP_KERNEL);
if (!work_ctx)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c
new file mode 100644
index 000000000000..5161b56a12f9
--- /dev/null
+++ b/kernel/irq/irq_test.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: LGPL-2.1+
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/irqdomain.h>
+#include <linux/nodemask.h>
+#include <kunit/test.h>
+
+#include "internals.h"
+
+static irqreturn_t noop_handler(int irq, void *data)
+{
+ return IRQ_HANDLED;
+}
+
+static void noop(struct irq_data *data) { }
+static unsigned int noop_ret(struct irq_data *data) { return 0; }
+
+static int noop_affinity(struct irq_data *data, const struct cpumask *dest,
+ bool force)
+{
+ irq_data_update_effective_affinity(data, dest);
+
+ return 0;
+}
+
+static struct irq_chip fake_irq_chip = {
+ .name = "fake",
+ .irq_startup = noop_ret,
+ .irq_shutdown = noop,
+ .irq_enable = noop,
+ .irq_disable = noop,
+ .irq_ack = noop,
+ .irq_mask = noop,
+ .irq_unmask = noop,
+ .irq_set_affinity = noop_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static void irq_disable_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_free_disabled_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ free_irq(virq, NULL);
+ KUNIT_EXPECT_GE(test, desc->depth, 1);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_shutdown_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ int virq, ret;
+ struct irq_affinity_desc affinity = {
+ .is_managed = 1,
+ .mask = CPU_MASK_ALL,
+ };
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ kunit_skip(test, "requires CONFIG_SMP for managed shutdown");
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ data = irq_desc_get_irq_data(desc);
+ KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ irq_shutdown_and_deactivate(desc);
+
+ KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+
+ KUNIT_EXPECT_EQ(test, irq_activate(desc), 0);
+#ifdef CONFIG_SMP
+ irq_startup_managed(desc);
+#endif
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_cpuhotplug_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ int virq, ret;
+ struct irq_affinity_desc affinity = {
+ .is_managed = 1,
+ };
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ kunit_skip(test, "requires CONFIG_SMP for CPU hotplug");
+ if (!get_cpu_device(1))
+ kunit_skip(test, "requires more than 1 CPU for CPU hotplug");
+ if (!cpu_is_hotpluggable(1))
+ kunit_skip(test, "CPU 1 must be hotpluggable");
+
+ cpumask_copy(&affinity.mask, cpumask_of(1));
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ data = irq_desc_get_irq_data(desc);
+ KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ KUNIT_EXPECT_EQ(test, remove_cpu(1), 0);
+ KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+ KUNIT_EXPECT_GE(test, desc->depth, 1);
+ KUNIT_EXPECT_EQ(test, add_cpu(1), 0);
+
+ KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static struct kunit_case irq_test_cases[] = {
+ KUNIT_CASE(irq_disable_depth_test),
+ KUNIT_CASE(irq_free_disabled_test),
+ KUNIT_CASE(irq_shutdown_depth_test),
+ KUNIT_CASE(irq_cpuhotplug_test),
+ {}
+};
+
+static struct kunit_suite irq_test_suite = {
+ .name = "irq_test_cases",
+ .test_cases = irq_test_cases,
+};
+
+kunit_test_suite(irq_test_suite);
+MODULE_DESCRIPTION("IRQ unit test suite");
+MODULE_LICENSE("GPL");
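
One way to exercise the new suite is through the in-tree KUnit wrapper under UML; a possible kunitconfig fragment and invocation, shown only as a sketch (see Documentation/dev-tools/kunit for the authoritative options):

	# .kunitconfig
	CONFIG_KUNIT=y
	CONFIG_SMP=y
	CONFIG_IRQ_KUNIT_TEST=y

	$ ./tools/testing/kunit/kunit.py run --kunitconfig=.kunitconfig 'irq_test_cases'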
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4258cd6bd3b4..b64c57b44c20 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -246,8 +246,7 @@ static struct kobject *irq_kobj_base;
#define IRQ_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
-static ssize_t per_cpu_count_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
ssize_t ret = 0;
@@ -257,112 +256,83 @@ static ssize_t per_cpu_count_show(struct kobject *kobj,
for_each_possible_cpu(cpu) {
unsigned int c = irq_desc_kstat_cpu(desc, cpu);
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
+ ret += sysfs_emit_at(buf, ret, "%s%u", p, c);
p = ",";
}
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
IRQ_ATTR_RO(per_cpu_count);
-static ssize_t chip_name_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
-
- raw_spin_lock_irq(&desc->lock);
- if (desc->irq_data.chip && desc->irq_data.chip->name) {
- ret = scnprintf(buf, PAGE_SIZE, "%s\n",
- desc->irq_data.chip->name);
- }
- raw_spin_unlock_irq(&desc->lock);
- return ret;
+ guard(raw_spinlock_irq)(&desc->lock);
+ if (desc->irq_data.chip && desc->irq_data.chip->name)
+ return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name);
+ return 0;
}
IRQ_ATTR_RO(chip_name);
-static ssize_t hwirq_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->irq_data.domain)
- ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq);
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq);
+ return 0;
}
IRQ_ATTR_RO(hwirq);
-static ssize_t type_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
- raw_spin_lock_irq(&desc->lock);
- ret = sprintf(buf, "%s\n",
- irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ guard(raw_spinlock_irq)(&desc->lock);
+ return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
}
IRQ_ATTR_RO(type);
-static ssize_t wakeup_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
-
- raw_spin_lock_irq(&desc->lock);
- ret = sprintf(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data)));
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ guard(raw_spinlock_irq)(&desc->lock);
+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data)));
}
IRQ_ATTR_RO(wakeup);
-static ssize_t name_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- ssize_t ret = 0;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->name)
- ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name);
- raw_spin_unlock_irq(&desc->lock);
-
- return ret;
+ return sysfs_emit(buf, "%s\n", desc->name);
+ return 0;
}
IRQ_ATTR_RO(name);
-static ssize_t actions_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
struct irqaction *action;
ssize_t ret = 0;
char *p = "";
- raw_spin_lock_irq(&desc->lock);
- for_each_action_of_desc(desc, action) {
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
- p, action->name);
- p = ",";
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ for_each_action_of_desc(desc, action) {
+ ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name);
+ p = ",";
+ }
}
- raw_spin_unlock_irq(&desc->lock);
if (ret)
- ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
-
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
IRQ_ATTR_RO(actions);
@@ -418,19 +388,14 @@ static int __init irq_sysfs_init(void)
int irq;
/* Prevent concurrent irq alloc/free */
- irq_lock_sparse();
-
+ guard(mutex)(&sparse_irq_lock);
irq_kobj_base = kobject_create_and_add("irq", kernel_kobj);
- if (!irq_kobj_base) {
- irq_unlock_sparse();
+ if (!irq_kobj_base)
return -ENOMEM;
- }
/* Add the already allocated interrupts */
for_each_irq_desc(irq, desc)
irq_sysfs_add(irq, desc);
- irq_unlock_sparse();
-
return 0;
}
postcore_initcall(irq_sysfs_init);
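The conversions in this file lean on the cleanup.h lock guards. A small stand-alone sketch (hypothetical lock and counter, not taken from the kernel tree) of the two forms used here, guard() for whole-function scope and scoped_guard() for a bounded block:

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>

static DEFINE_MUTEX(demo_mutex);
static DEFINE_RAW_SPINLOCK(demo_lock);
static int demo_count;

static int demo_add(int value)
{
	guard(mutex)(&demo_mutex);		/* dropped automatically on any return */

	if (value < 0)
		return -EINVAL;			/* no explicit unlock needed */

	scoped_guard(raw_spinlock_irqsave, &demo_lock)
		demo_count += value;		/* spinlock held only for this statement */

	return 0;
}

This is what lets the rewritten functions return directly from inside a locked region without the unlock/goto bookkeeping of the old code.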
@@ -573,12 +538,12 @@ err:
return -ENOMEM;
}
-static int irq_expand_nr_irqs(unsigned int nr)
+static bool irq_expand_nr_irqs(unsigned int nr)
{
if (nr > MAX_SPARSE_IRQS)
- return -ENOMEM;
+ return false;
nr_irqs = nr;
- return 0;
+ return true;
}
int __init early_irq_init(void)
@@ -656,11 +621,9 @@ EXPORT_SYMBOL(irq_to_desc);
static void free_desc(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL);
delete_irq_desc(irq);
}
@@ -679,16 +642,15 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
return start;
}
-static int irq_expand_nr_irqs(unsigned int nr)
+static inline bool irq_expand_nr_irqs(unsigned int nr)
{
- return -ENOMEM;
+ return false;
}
void irq_mark_irq(unsigned int irq)
{
- mutex_lock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
irq_insert_desc(irq, irq_desc + irq);
- mutex_unlock(&sparse_irq_lock);
}
#ifdef CONFIG_GENERIC_IRQ_LEGACY
@@ -827,11 +789,9 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
- mutex_lock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
for (i = 0; i < cnt; i++)
free_desc(from + i);
-
- mutex_unlock(&sparse_irq_lock);
}
EXPORT_SYMBOL_GPL(irq_free_descs);
@@ -848,11 +808,10 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
*
* Returns the first irq number or error code
*/
-int __ref
-__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
- struct module *owner, const struct irq_affinity_desc *affinity)
+int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
+ struct module *owner, const struct irq_affinity_desc *affinity)
{
- int start, ret;
+ int start;
if (!cnt)
return -EINVAL;
@@ -870,22 +829,17 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
from = arch_dynirq_lower_bound(from);
}
- mutex_lock(&sparse_irq_lock);
+ guard(mutex)(&sparse_irq_lock);
start = irq_find_free_area(from, cnt);
- ret = -EEXIST;
if (irq >=0 && start != irq)
- goto unlock;
+ return -EEXIST;
if (start + cnt > nr_irqs) {
- ret = irq_expand_nr_irqs(start + cnt);
- if (ret)
- goto unlock;
+ if (!irq_expand_nr_irqs(start + cnt))
+ return -ENOMEM;
}
- ret = alloc_descs(start, cnt, node, affinity, owner);
-unlock:
- mutex_unlock(&sparse_irq_lock);
- return ret;
+ return alloc_descs(start, cnt, node, affinity, owner);
}
EXPORT_SYMBOL_GPL(__irq_alloc_descs);
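As a usage illustration (hypothetical caller, not part of the change), the reworked allocator above is normally reached through the irq_alloc_descs()/irq_free_descs() wrappers:

#include <linux/irq.h>
#include <linux/module.h>
#include <linux/numa.h>

static int demo_alloc_irq_block(void)
{
	/* Let the core pick any free range of four descriptors on any node. */
	int base = irq_alloc_descs(-1, 0, 4, NUMA_NO_NODE);

	if (base < 0)
		return base;	/* -EINVAL, -EEXIST or -ENOMEM, as in the code above */

	/* ... associate hwirqs with base..base+3 here ... */

	irq_free_descs(base, 4);
	return 0;
}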
@@ -900,27 +854,27 @@ unsigned int irq_get_next_irq(unsigned int offset)
return irq_find_at_or_after(offset);
}
-struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
- unsigned int check)
+struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check)
{
- struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_desc *desc;
- if (desc) {
- if (check & _IRQ_DESC_CHECK) {
- if ((check & _IRQ_DESC_PERCPU) &&
- !irq_settings_is_per_cpu_devid(desc))
- return NULL;
-
- if (!(check & _IRQ_DESC_PERCPU) &&
- irq_settings_is_per_cpu_devid(desc))
- return NULL;
- }
+ desc = irq_to_desc(irq);
+ if (!desc)
+ return NULL;
+
+ if (check & _IRQ_DESC_CHECK) {
+ if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc))
+ return NULL;
- if (bus)
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, *flags);
+ if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc))
+ return NULL;
}
+
+ if (bus)
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, *flags);
+
return desc;
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 9d5c8651492d..dc473faadcc8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -317,6 +317,7 @@ static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info
domain->flags |= info->domain_flags;
domain->exit = info->exit;
+ domain->dev = info->dev;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
if (info->parent) {
@@ -480,33 +481,6 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode,
}
EXPORT_SYMBOL_GPL(irq_domain_create_simple);
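Since irq_domain_add_legacy() is removed just below, a converted caller goes through the fwnode-based variant instead. A rough sketch with hypothetical parameters:

#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_domain *demo_create_legacy_domain(struct device_node *np,
						    const struct irq_domain_ops *ops,
						    void *host_data)
{
	/* 16 hwirqs starting at hwirq 0, pre-mapped onto Linux irqs 16..31. */
	return irq_domain_create_legacy(of_fwnode_handle(np), 16, 16, 0,
					ops, host_data);
}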
-/**
- * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
- * @of_node: pointer to interrupt controller's device tree node.
- * @size: total number of irqs in legacy mapping
- * @first_irq: first number of irq block assigned to the domain
- * @first_hwirq: first hwirq number to use for the translation. Should normally
- * be '0', but a positive integer can be used if the effective
- * hwirqs numbering does not begin at zero.
- * @ops: map/unmap domain callbacks
- * @host_data: Controller private data pointer
- *
- * Note: the map() callback will be called before this function returns
- * for all legacy interrupts except 0 (which is always the invalid irq for
- * a legacy controller).
- */
-struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
- unsigned int size,
- unsigned int first_irq,
- irq_hw_number_t first_hwirq,
- const struct irq_domain_ops *ops,
- void *host_data)
-{
- return irq_domain_create_legacy(of_node_to_fwnode(of_node), size,
- first_irq, first_hwirq, ops, host_data);
-}
-EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
-
struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode,
unsigned int size,
unsigned int first_irq,
@@ -885,7 +859,7 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
{
int i;
- fwspec->fwnode = of_node_to_fwnode(np);
+ fwspec->fwnode = of_fwnode_handle(np);
fwspec->param_count = count;
for (i = 0; i < count; i++)
@@ -1133,6 +1107,31 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
/**
+ * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings
+ * @d: Interrupt domain involved in the translation
+ * @ctrlr: The device tree node for the device whose interrupt is translated
+ * @intspec: The interrupt specifier data from the device tree
+ * @intsize: The number of entries in @intspec
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ *
+ * Device Tree interrupt specifier translation function for two or three
+ * cell bindings, where the cell values map directly to the hardware
+ * interrupt number and the type specifier.
+ */
+int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr,
+ const u32 *intspec, unsigned int intsize,
+ irq_hw_number_t *out_hwirq, unsigned int *out_type)
+{
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec);
+
+ return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type);
+}
+EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell);
+
+/**
* irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
* @d: Interrupt domain involved in the translation
* @ctrlr: The device tree node for the device whose interrupt is translated
@@ -1216,6 +1215,37 @@ int irq_domain_translate_twocell(struct irq_domain *d,
}
EXPORT_SYMBOL_GPL(irq_domain_translate_twocell);
+/**
+ * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell
+ * bindings
+ * @d: Interrupt domain involved in the translation
+ * @fwspec: The firmware interrupt specifier to translate
+ * @out_hwirq: Pointer to storage for the hardware interrupt number
+ * @out_type: Pointer to storage for the interrupt type
+ *
+ * Firmware interrupt specifier translation function for two or three cell
+ * specifications, where the parameter values map directly to the hardware
+ * interrupt number and the type specifier.
+ */
+int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq, unsigned int *out_type)
+{
+ if (fwspec->param_count == 2) {
+ *out_hwirq = fwspec->param[0];
+ *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+ }
+
+ if (fwspec->param_count == 3) {
+ *out_hwirq = fwspec->param[1];
+ *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell);
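A driver would typically wire the new helper straight into its irq_domain_ops. The ops structure and the .map callback below are hypothetical; only the .xlate assignment reflects the API added here:

#include <linux/irqdomain.h>

static int demo_domain_map(struct irq_domain *d, unsigned int virq,
			   irq_hw_number_t hwirq)
{
	/* per-interrupt setup would go here */
	return 0;
}

static const struct irq_domain_ops demo_domain_ops = {
	.map	= demo_domain_map,
	/*
	 * Accepts both bindings: two cells (hwirq, type), or three cells
	 * where the first cell is skipped and the last two supply hwirq
	 * and type, matching irq_domain_translate_twothreecell() above.
	 */
	.xlate	= irq_domain_xlate_twothreecell,
};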
+
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
int node, const struct irq_affinity_desc *affinity)
{
@@ -1252,47 +1282,6 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data)
EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
-/**
- * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy
- * @parent: Parent irq domain to associate with the new domain
- * @flags: Irq domain flags associated to the domain
- * @size: Size of the domain. See below
- * @fwnode: Optional fwnode of the interrupt controller
- * @ops: Pointer to the interrupt domain callbacks
- * @host_data: Controller private data pointer
- *
- * If @size is 0 a tree domain is created, otherwise a linear domain.
- *
- * If successful the parent is associated to the new domain and the
- * domain flags are set.
- * Returns pointer to IRQ domain, or NULL on failure.
- */
-struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
- unsigned int flags,
- unsigned int size,
- struct fwnode_handle *fwnode,
- const struct irq_domain_ops *ops,
- void *host_data)
-{
- struct irq_domain_info info = {
- .fwnode = fwnode,
- .size = size,
- .hwirq_max = size,
- .ops = ops,
- .host_data = host_data,
- .domain_flags = flags,
- .parent = parent,
- };
- struct irq_domain *d;
-
- if (!info.size)
- info.hwirq_max = ~0U;
-
- d = irq_domain_instantiate(&info);
- return IS_ERR(d) ? NULL : d;
-}
-EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy);
-
static void irq_domain_insert_irq(int virq)
{
struct irq_data *data;
@@ -1573,6 +1562,7 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
}
irq_domain_free_irqs_common(domain, virq, nr_irqs);
}
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_top);
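The new export lets a modular irqchip reuse the generic teardown in its hierarchy callbacks. A speculative sketch, with invented driver names:

#include <linux/irqdomain.h>

static void demo_domain_free(struct irq_domain *domain, unsigned int virq,
			     unsigned int nr_irqs)
{
	/* Driver-specific cleanup for virq..virq+nr_irqs-1 would go first. */
	irq_domain_free_irqs_top(domain, virq, nr_irqs);
}

static const struct irq_domain_ops demo_hier_ops = {
	.free	= demo_domain_free,
};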
static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
unsigned int irq_base,
@@ -2008,7 +1998,7 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
}
#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
-/**
+/*
* irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
* @domain: domain to match
* @virq: IRQ number to get irq_data
@@ -2022,7 +2012,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
}
EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
-/**
+/*
* irq_domain_set_info - Set the complete data for a @virq in @domain
* @domain: Interrupt domain to match
* @virq: IRQ number
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 753eef8e041c..c94837382037 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -43,8 +43,6 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
bool inprogress;
do {
- unsigned long flags;
-
/*
* Wait until we're out of the critical section. This might
* give the wrong answer due to the lack of memory barriers.
@@ -53,7 +51,7 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
cpu_relax();
/* Ok, that indicated we're done: double-check carefully. */
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
inprogress = irqd_irq_inprogress(&desc->irq_data);
/*
@@ -69,33 +67,30 @@ static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
__irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
&inprogress);
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
/* Oops, that failed? */
} while (inprogress);
}
/**
- * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
+ * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
*
- * This function waits for any pending hard IRQ handlers for this
- * interrupt to complete before returning. If you use this
- * function while holding a resource the IRQ handler may need you
- * will deadlock. It does not take associated threaded handlers
- * into account.
+ * This function waits for any pending hard IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock. It does not take
+ * associated threaded handlers into account.
*
- * Do not use this for shutdown scenarios where you must be sure
- * that all parts (hardirq and threaded handler) have completed.
+ * Do not use this for shutdown scenarios where you must be sure that all
+ * parts (hardirq and threaded handler) have completed.
*
- * Returns: false if a threaded handler is active.
+ * Returns: false if a threaded handler is active.
*
- * This function may be called - with care - from IRQ context.
+ * This function may be called - with care - from IRQ context.
*
- * It does not check whether there is an interrupt in flight at the
- * hardware level, but not serviced yet, as this might deadlock when
- * called with interrupts disabled and the target CPU of the interrupt
- * is the current CPU.
+ * It does not check whether there is an interrupt in flight at the
+ * hardware level, but not serviced yet, as this might deadlock when called
+ * with interrupts disabled and the target CPU of the interrupt is the
+ * current CPU.
*/
bool synchronize_hardirq(unsigned int irq)
{
@@ -121,19 +116,19 @@ static void __synchronize_irq(struct irq_desc *desc)
}
/**
- * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
- * @irq: interrupt number to wait for
+ * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ * @irq: interrupt number to wait for
*
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
+ * This function waits for any pending IRQ handlers for this interrupt to
+ * complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock.
*
- * Can only be called from preemptible code as it might sleep when
- * an interrupt thread is associated to @irq.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
*
- * It optionally makes sure (when the irq chip supports that method)
- * that the interrupt is not pending in any CPU and waiting for
- * service.
+ * It optionally makes sure (when the irq chip supports that method)
+ * that the interrupt is not pending in any CPU and waiting for
+ * service.
*/
void synchronize_irq(unsigned int irq)
{
@@ -156,8 +151,8 @@ static bool __irq_can_set_affinity(struct irq_desc *desc)
}
/**
- * irq_can_set_affinity - Check if the affinity of a given irq can be set
- * @irq: Interrupt to check
+ * irq_can_set_affinity - Check if the affinity of a given irq can be set
+ * @irq: Interrupt to check
*
*/
int irq_can_set_affinity(unsigned int irq)
@@ -181,13 +176,13 @@ bool irq_can_set_affinity_usr(unsigned int irq)
}
/**
- * irq_set_thread_affinity - Notify irq threads to adjust affinity
- * @desc: irq descriptor which has affinity changed
+ * irq_set_thread_affinity - Notify irq threads to adjust affinity
+ * @desc: irq descriptor which has affinity changed
*
- * We just set IRQTF_AFFINITY and delegate the affinity setting
- * to the interrupt thread itself. We can not call
- * set_cpus_allowed_ptr() here as we hold desc->lock and this
- * code can be called from hard interrupt context.
+ * Just set IRQTF_AFFINITY and delegate the affinity setting to the
+ * interrupt thread itself. We can not call set_cpus_allowed_ptr() here as
+ * we hold desc->lock and this code can be called from hard interrupt
+ * context.
*/
static void irq_set_thread_affinity(struct irq_desc *desc)
{
@@ -400,14 +395,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
* an interrupt which is already started or which has already been configured
* as managed will also fail, as these mean invalid init state or double init.
*/
-int irq_update_affinity_desc(unsigned int irq,
- struct irq_affinity_desc *affinity)
+int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity)
{
- struct irq_desc *desc;
- unsigned long flags;
- bool activated;
- int ret = 0;
-
/*
* Supporting this with the reservation scheme used by x86 needs
* some more thought. Fail it for now.
@@ -415,60 +404,50 @@ int irq_update_affinity_desc(unsigned int irq,
if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE))
return -EOPNOTSUPP;
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return -EINVAL;
-
- /* Requires the interrupt to be shut down */
- if (irqd_is_started(&desc->irq_data)) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ bool activated;
- /* Interrupts which are already managed cannot be modified */
- if (irqd_affinity_is_managed(&desc->irq_data)) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- /*
- * Deactivate the interrupt. That's required to undo
- * anything an earlier activation has established.
- */
- activated = irqd_is_activated(&desc->irq_data);
- if (activated)
- irq_domain_deactivate_irq(&desc->irq_data);
+ /* Requires the interrupt to be shut down */
+ if (irqd_is_started(&desc->irq_data))
+ return -EBUSY;
- if (affinity->is_managed) {
- irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
- irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN);
- }
+ /* Interrupts which are already managed cannot be modified */
+ if (irqd_affinity_is_managed(&desc->irq_data))
+ return -EBUSY;
+ /*
+ * Deactivate the interrupt. That's required to undo
+ * anything an earlier activation has established.
+ */
+ activated = irqd_is_activated(&desc->irq_data);
+ if (activated)
+ irq_domain_deactivate_irq(&desc->irq_data);
- cpumask_copy(desc->irq_common_data.affinity, &affinity->mask);
+ if (affinity->is_managed) {
+ irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
+ irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN);
+ }
- /* Restore the activation state */
- if (activated)
- irq_domain_activate_irq(&desc->irq_data, false);
+ cpumask_copy(desc->irq_common_data.affinity, &affinity->mask);
-out_unlock:
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ /* Restore the activation state */
+ if (activated)
+ irq_domain_activate_irq(&desc->irq_data, false);
+ return 0;
+ }
+ return -EINVAL;
}
static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask,
bool force)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- int ret;
if (!desc)
return -EINVAL;
- raw_spin_lock_irqsave(&desc->lock, flags);
- ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
+ guard(raw_spinlock_irqsave)(&desc->lock);
+ return irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force);
}
/**
@@ -501,39 +480,36 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
}
EXPORT_SYMBOL_GPL(irq_force_affinity);
-int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
- bool setaffinity)
+int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, bool setaffinity)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ int ret = -EINVAL;
- if (!desc)
- return -EINVAL;
- desc->affinity_hint = m;
- irq_put_desc_unlock(desc, flags);
- if (m && setaffinity)
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ scoped_irqdesc->affinity_hint = m;
+ ret = 0;
+ }
+
+ if (!ret && m && setaffinity)
__irq_set_affinity(irq, m, false);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
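Drivers reach this through the inline wrappers in <linux/interrupt.h>. A hypothetical multiqueue-style user (queue/CPU pairing invented for illustration) might look like:

#include <linux/cpumask.h>
#include <linux/interrupt.h>

static int demo_pin_queue_irq(unsigned int irq, unsigned int cpu)
{
	/* Publish the hint in /proc/irq/<irq>/affinity_hint and move the IRQ there. */
	return irq_set_affinity_and_hint(irq, cpumask_of(cpu));
}

static void demo_unpin_queue_irq(unsigned int irq)
{
	/* Clear the hint again before the interrupt is freed. */
	irq_update_affinity_hint(irq, NULL);
}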
static void irq_affinity_notify(struct work_struct *work)
{
- struct irq_affinity_notify *notify =
- container_of(work, struct irq_affinity_notify, work);
+ struct irq_affinity_notify *notify = container_of(work, struct irq_affinity_notify, work);
struct irq_desc *desc = irq_to_desc(notify->irq);
cpumask_var_t cpumask;
- unsigned long flags;
if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
goto out;
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_move_pending(&desc->irq_data))
- irq_get_pending(cpumask, desc);
- else
- cpumask_copy(cpumask, desc->irq_common_data.affinity);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ if (irq_move_pending(&desc->irq_data))
+ irq_get_pending(cpumask, desc);
+ else
+ cpumask_copy(cpumask, desc->irq_common_data.affinity);
+ }
notify->notify(notify, cpumask);
@@ -543,22 +519,20 @@ out:
}
/**
- * irq_set_affinity_notifier - control notification of IRQ affinity changes
- * @irq: Interrupt for which to enable/disable notification
- * @notify: Context for notification, or %NULL to disable
- * notification. Function pointers must be initialised;
- * the other fields will be initialised by this function.
- *
- * Must be called in process context. Notification may only be enabled
- * after the IRQ is allocated and must be disabled before the IRQ is
- * freed using free_irq().
+ * irq_set_affinity_notifier - control notification of IRQ affinity changes
+ * @irq: Interrupt for which to enable/disable notification
+ * @notify: Context for notification, or %NULL to disable
+ * notification. Function pointers must be initialised;
+ * the other fields will be initialised by this function.
+ *
+ * Must be called in process context. Notification may only be enabled
+ * after the IRQ is allocated and must be disabled before the IRQ is freed
+ * using free_irq().
*/
-int
-irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_affinity_notify *old_notify;
- unsigned long flags;
/* The release function is promised process context */
might_sleep();
@@ -573,10 +547,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
INIT_WORK(&notify->work, irq_affinity_notify);
}
- raw_spin_lock_irqsave(&desc->lock, flags);
- old_notify = desc->affinity_notify;
- desc->affinity_notify = notify;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ old_notify = desc->affinity_notify;
+ desc->affinity_notify = notify;
+ }
if (old_notify) {
if (cancel_work_sync(&old_notify->work)) {
@@ -597,7 +571,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
int irq_setup_affinity(struct irq_desc *desc)
{
struct cpumask *set = irq_default_affinity;
- int ret, node = irq_desc_get_node(desc);
+ int node = irq_desc_get_node(desc);
+
static DEFINE_RAW_SPINLOCK(mask_lock);
static struct cpumask mask;
@@ -605,7 +580,7 @@ int irq_setup_affinity(struct irq_desc *desc)
if (!__irq_can_set_affinity(desc))
return 0;
- raw_spin_lock(&mask_lock);
+ guard(raw_spinlock)(&mask_lock);
/*
* Preserve the managed affinity setting and a userspace affinity
* setup, but make sure that one of the targets is online.
@@ -630,9 +605,7 @@ int irq_setup_affinity(struct irq_desc *desc)
if (cpumask_intersects(&mask, nodemask))
cpumask_and(&mask, &mask, nodemask);
}
- ret = irq_do_set_affinity(&desc->irq_data, &mask, false);
- raw_spin_unlock(&mask_lock);
- return ret;
+ return irq_do_set_affinity(&desc->irq_data, &mask, false);
}
#else
/* Wrapper for ALPHA specific affinity selector magic */
@@ -645,44 +618,36 @@ int irq_setup_affinity(struct irq_desc *desc)
/**
- * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
- * @irq: interrupt number to set affinity
- * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
- * specific data for percpu_devid interrupts
- *
- * This function uses the vCPU specific data to set the vCPU
- * affinity for an irq. The vCPU specific data is passed from
- * outside, such as KVM. One example code path is as below:
- * KVM -> IOMMU -> irq_set_vcpu_affinity().
+ * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ * @irq: interrupt number to set affinity
+ * @vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
+ * specific data for percpu_devid interrupts
+ *
+ * This function uses the vCPU specific data to set the vCPU affinity for
+ * an irq. The vCPU specific data is passed from outside, such as KVM. One
+ * example code path is as below: KVM -> IOMMU -> irq_set_vcpu_affinity().
*/
int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
- struct irq_data *data;
- struct irq_chip *chip;
- int ret = -ENOSYS;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
+ struct irq_data *data;
+ struct irq_chip *chip;
- if (!desc)
- return -EINVAL;
-
- data = irq_desc_get_irq_data(desc);
- do {
- chip = irq_data_get_irq_chip(data);
- if (chip && chip->irq_set_vcpu_affinity)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
+ data = irq_desc_get_irq_data(desc);
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ break;
- if (data)
- ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
- irq_put_desc_unlock(desc, flags);
+ data = irqd_get_parent_data(data);
+ } while (data);
- return ret;
+ if (!data)
+ return -ENOSYS;
+ return chip->irq_set_vcpu_affinity(data, vcpu_info);
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
@@ -694,26 +659,23 @@ void __disable_irq(struct irq_desc *desc)
static int __disable_irq_nosync(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return -EINVAL;
- __disable_irq(desc);
- irq_put_desc_busunlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ __disable_irq(scoped_irqdesc);
+ return 0;
+ }
+ return -EINVAL;
}
/**
- * disable_irq_nosync - disable an irq without waiting
- * @irq: Interrupt to disable
+ * disable_irq_nosync - disable an irq without waiting
+ * @irq: Interrupt to disable
*
- * Disable the selected interrupt line. Disables and Enables are
- * nested.
- * Unlike disable_irq(), this function does not ensure existing
- * instances of the IRQ handler have completed before returning.
+ * Disable the selected interrupt line. Disables and Enables are
+ * nested.
+ * Unlike disable_irq(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
*
- * This function may be called from IRQ context.
+ * This function may be called from IRQ context.
*/
void disable_irq_nosync(unsigned int irq)
{
@@ -722,17 +684,17 @@ void disable_irq_nosync(unsigned int irq)
EXPORT_SYMBOL(disable_irq_nosync);
/**
- * disable_irq - disable an irq and wait for completion
- * @irq: Interrupt to disable
+ * disable_irq - disable an irq and wait for completion
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Enables and Disables are nested.
*
- * Disable the selected interrupt line. Enables and Disables are
- * nested.
- * This function waits for any pending IRQ handlers for this interrupt
- * to complete before returning. If you use this function while
- * holding a resource the IRQ handler may need you will deadlock.
+ * This function waits for any pending IRQ handlers for this interrupt to
+ * complete before returning. If you use this function while holding a
+ * resource the IRQ handler may need you will deadlock.
*
- * Can only be called from preemptible code as it might sleep when
- * an interrupt thread is associated to @irq.
+ * Can only be called from preemptible code as it might sleep when an
+ * interrupt thread is associated to @irq.
*
*/
void disable_irq(unsigned int irq)
@@ -744,40 +706,39 @@ void disable_irq(unsigned int irq)
EXPORT_SYMBOL(disable_irq);
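The disable/enable nesting described above, in driver terms, as a small sketch (the device reprogramming step is hypothetical):

#include <linux/interrupt.h>

static void demo_reconfigure(unsigned int irq)
{
	disable_irq(irq);	/* waits for any running handler to complete */

	/* ... reprogram the device while its interrupt is masked ... */

	enable_irq(irq);	/* serviced again once the last disable is undone */
}

Each disable_irq() must be balanced by exactly one enable_irq(); the line stays masked until the nesting count returns to zero.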
/**
- * disable_hardirq - disables an irq and waits for hardirq completion
- * @irq: Interrupt to disable
+ * disable_hardirq - disables an irq and waits for hardirq completion
+ * @irq: Interrupt to disable
*
- * Disable the selected interrupt line. Enables and Disables are
- * nested.
- * This function waits for any pending hard IRQ handlers for this
- * interrupt to complete before returning. If you use this function while
- * holding a resource the hard IRQ handler may need you will deadlock.
+ * Disable the selected interrupt line. Enables and Disables are nested.
*
- * When used to optimistically disable an interrupt from atomic context
- * the return value must be checked.
+ * This function waits for any pending hard IRQ handlers for this interrupt
+ * to complete before returning. If you use this function while holding a
+ * resource the hard IRQ handler may need you will deadlock.
*
- * Returns: false if a threaded handler is active.
+ * When used to optimistically disable an interrupt from atomic context the
+ * return value must be checked.
*
- * This function may be called - with care - from IRQ context.
+ * Returns: false if a threaded handler is active.
+ *
+ * This function may be called - with care - from IRQ context.
*/
bool disable_hardirq(unsigned int irq)
{
if (!__disable_irq_nosync(irq))
return synchronize_hardirq(irq);
-
return false;
}
EXPORT_SYMBOL_GPL(disable_hardirq);
/**
- * disable_nmi_nosync - disable an nmi without waiting
- * @irq: Interrupt to disable
- *
- * Disable the selected interrupt line. Disables and enables are
- * nested.
- * The interrupt to disable must have been requested through request_nmi.
- * Unlike disable_nmi(), this function does not ensure existing
- * instances of the IRQ handler have completed before returning.
+ * disable_nmi_nosync - disable an nmi without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and enables are nested.
+ *
+ * The interrupt to disable must have been requested through request_nmi.
+ * Unlike disable_nmi(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
*/
void disable_nmi_nosync(unsigned int irq)
{
@@ -817,41 +778,34 @@ void __enable_irq(struct irq_desc *desc)
}
/**
- * enable_irq - enable handling of an irq
- * @irq: Interrupt to enable
+ * enable_irq - enable handling of an irq
+ * @irq: Interrupt to enable
*
- * Undoes the effect of one call to disable_irq(). If this
- * matches the last disable, processing of interrupts on this
- * IRQ line is re-enabled.
+ * Undoes the effect of one call to disable_irq(). If this matches the
+ * last disable, processing of interrupts on this IRQ line is re-enabled.
*
- * This function may be called from IRQ context only when
- * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
+ * This function may be called from IRQ context only when
+ * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
*/
void enable_irq(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return;
- if (WARN(!desc->irq_data.chip,
- KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
- goto out;
-
- __enable_irq(desc);
-out:
- irq_put_desc_busunlock(desc, flags);
+ if (WARN(!desc->irq_data.chip, "enable_irq before setup/request_irq: irq %u\n", irq))
+ return;
+ __enable_irq(desc);
+ }
}
EXPORT_SYMBOL(enable_irq);
/**
- * enable_nmi - enable handling of an nmi
- * @irq: Interrupt to enable
+ * enable_nmi - enable handling of an nmi
+ * @irq: Interrupt to enable
*
- * The interrupt to enable must have been requested through request_nmi.
- * Undoes the effect of one call to disable_nmi(). If this
- * matches the last disable, processing of interrupts on this
- * IRQ line is re-enabled.
+ * The interrupt to enable must have been requested through request_nmi.
+ * Undoes the effect of one call to disable_nmi(). If this matches the last
+ * disable, processing of interrupts on this IRQ line is re-enabled.
*/
void enable_nmi(unsigned int irq)
{
@@ -873,65 +827,59 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
}
/**
- * irq_set_irq_wake - control irq power management wakeup
- * @irq: interrupt to control
- * @on: enable/disable power management wakeup
- *
- * Enable/disable power management wakeup mode, which is
- * disabled by default. Enables and disables must match,
- * just as they match for non-wakeup mode support.
- *
- * Wakeup mode lets this IRQ wake the system from sleep
- * states like "suspend to RAM".
- *
- * Note: irq enable/disable state is completely orthogonal
- * to the enable/disable state of irq wake. An irq can be
- * disabled with disable_irq() and still wake the system as
- * long as the irq has wake enabled. If this does not hold,
- * then the underlying irq chip and the related driver need
- * to be investigated.
+ * irq_set_irq_wake - control irq power management wakeup
+ * @irq: interrupt to control
+ * @on: enable/disable power management wakeup
+ *
+ * Enable/disable power management wakeup mode, which is disabled by
+ * default. Enables and disables must match, just as they match for
+ * non-wakeup mode support.
+ *
+ * Wakeup mode lets this IRQ wake the system from sleep states like
+ * "suspend to RAM".
+ *
+ * Note: irq enable/disable state is completely orthogonal to the
+ * enable/disable state of irq wake. An irq can be disabled with
+ * disable_irq() and still wake the system as long as the irq has wake
+ * enabled. If this does not hold, then the underlying irq chip and the
+ * related driver need to be investigated.
*/
int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
- int ret = 0;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
+ int ret = 0;
- if (!desc)
- return -EINVAL;
-
- /* Don't use NMIs as wake up interrupts please */
- if (irq_is_nmi(desc)) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ /* Don't use NMIs as wake up interrupts please */
+ if (irq_is_nmi(desc))
+ return -EINVAL;
- /* wakeup-capable irqs can be shared between drivers that
- * don't need to have the same sleep mode behaviors.
- */
- if (on) {
- if (desc->wake_depth++ == 0) {
- ret = set_irq_wake_real(irq, on);
- if (ret)
- desc->wake_depth = 0;
- else
- irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
- }
- } else {
- if (desc->wake_depth == 0) {
- WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
- } else if (--desc->wake_depth == 0) {
- ret = set_irq_wake_real(irq, on);
- if (ret)
- desc->wake_depth = 1;
- else
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
+ /*
+ * wakeup-capable irqs can be shared between drivers that
+ * don't need to have the same sleep mode behaviors.
+ */
+ if (on) {
+ if (desc->wake_depth++ == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 0;
+ else
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
+ } else {
+ if (desc->wake_depth == 0) {
+ WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
+ } else if (--desc->wake_depth == 0) {
+ ret = set_irq_wake_real(irq, on);
+ if (ret)
+ desc->wake_depth = 1;
+ else
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
+ }
}
+ return ret;
}
-
-out_unlock:
- irq_put_desc_busunlock(desc, flags);
- return ret;
+ return -EINVAL;
}
EXPORT_SYMBOL(irq_set_irq_wake);
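Drivers normally use the enable_irq_wake()/disable_irq_wake() wrappers around this function from their suspend/resume paths. A minimal, hypothetical example:

#include <linux/interrupt.h>

static int demo_suspend(unsigned int wake_irq)
{
	return enable_irq_wake(wake_irq);	/* arm the line as a system wake source */
}

static int demo_resume(unsigned int wake_irq)
{
	return disable_irq_wake(wake_irq);	/* balance the enable from suspend */
}

As the comment above notes, wake enablement is independent of the irq enable/disable depth: a line disabled with disable_irq() can still wake the system while wake is enabled.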
@@ -940,22 +888,17 @@ EXPORT_SYMBOL(irq_set_irq_wake);
* particular irq has been exclusively allocated or is available
* for driver use.
*/
-int can_request_irq(unsigned int irq, unsigned long irqflags)
+bool can_request_irq(unsigned int irq, unsigned long irqflags)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
- int canrequest = 0;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return 0;
-
- if (irq_settings_can_request(desc)) {
- if (!desc->action ||
- irqflags & desc->action->flags & IRQF_SHARED)
- canrequest = 1;
+ if (irq_settings_can_request(desc)) {
+ if (!desc->action || irqflags & desc->action->flags & IRQF_SHARED)
+ return true;
+ }
}
- irq_put_desc_unlock(desc, flags);
- return canrequest;
+ return false;
}
int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
@@ -1016,16 +959,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
#ifdef CONFIG_HARDIRQS_SW_RESEND
int irq_set_parent(int irq, int parent_irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
-
- if (!desc)
- return -EINVAL;
-
- desc->parent_irq = parent_irq;
-
- irq_put_desc_unlock(desc, flags);
- return 0;
+ scoped_irqdesc_get_and_lock(irq, 0) {
+ scoped_irqdesc->parent_irq = parent_irq;
+ return 0;
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_set_parent);
#endif
@@ -1079,19 +1017,19 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *a
return;
}
- raw_spin_lock_irq(&desc->lock);
- /*
- * This code is triggered unconditionally. Check the affinity
- * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
- */
- if (cpumask_available(desc->irq_common_data.affinity)) {
- const struct cpumask *m;
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ /*
+ * This code is triggered unconditionally. Check the affinity
+ * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
+ */
+ if (cpumask_available(desc->irq_common_data.affinity)) {
+ const struct cpumask *m;
- m = irq_data_get_effective_affinity_mask(&desc->irq_data);
- cpumask_copy(mask, m);
- valid = true;
+ m = irq_data_get_effective_affinity_mask(&desc->irq_data);
+ cpumask_copy(mask, m);
+ valid = true;
+ }
}
- raw_spin_unlock_irq(&desc->lock);
if (valid)
set_cpus_allowed_ptr(current, mask);
@@ -1259,9 +1197,8 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
if (WARN_ON_ONCE(!secondary))
return;
- raw_spin_lock_irq(&desc->lock);
+ guard(raw_spinlock_irq)(&desc->lock);
__irq_wake_thread(desc, secondary);
- raw_spin_unlock_irq(&desc->lock);
}
/*
@@ -1334,21 +1271,19 @@ static int irq_thread(void *data)
}
/**
- * irq_wake_thread - wake the irq thread for the action identified by dev_id
- * @irq: Interrupt line
- * @dev_id: Device identity for which the thread should be woken
- *
+ * irq_wake_thread - wake the irq thread for the action identified by dev_id
+ * @irq: Interrupt line
+ * @dev_id: Device identity for which the thread should be woken
*/
void irq_wake_thread(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
for_each_action_of_desc(desc, action) {
if (action->dev_id == dev_id) {
if (action->thread)
@@ -1356,7 +1291,6 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
break;
}
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL_GPL(irq_wake_thread);
@@ -1987,9 +1921,8 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* There is no interrupt on the fly anymore. Deactivate it
* completely.
*/
- raw_spin_lock_irqsave(&desc->lock, flags);
- irq_domain_deactivate_irq(&desc->irq_data);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ irq_domain_deactivate_irq(&desc->irq_data);
irq_release_resources(desc);
chip_bus_sync_unlock(desc);
@@ -2005,20 +1938,19 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
}
/**
- * free_irq - free an interrupt allocated with request_irq
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
+ * free_irq - free an interrupt allocated with request_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
*
- * Remove an interrupt handler. The handler is removed and if the
- * interrupt line is no longer in use by any driver it is disabled.
- * On a shared IRQ the caller must ensure the interrupt is disabled
- * on the card it drives before calling this function. The function
- * does not return until any executing interrupts for this IRQ
- * have completed.
+ * Remove an interrupt handler. The handler is removed and if the interrupt
+ * line is no longer in use by any driver it is disabled. On a shared IRQ
+ * the caller must ensure the interrupt is disabled on the card it drives
+ * before calling this function. The function does not return until any
+ * executing interrupts for this IRQ have completed.
*
- * This function must not be called from interrupt context.
+ * This function must not be called from interrupt context.
*
- * Returns the devname argument passed to request_irq.
+ * Returns the devname argument passed to request_irq.
*/
const void *free_irq(unsigned int irq, void *dev_id)
{
@@ -2075,8 +2007,6 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
const void *free_nmi(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
- const void *devname;
if (!desc || WARN_ON(!irq_is_nmi(desc)))
return NULL;
@@ -2088,53 +2018,46 @@ const void *free_nmi(unsigned int irq, void *dev_id)
if (WARN_ON(desc->depth == 0))
disable_nmi_nosync(irq);
- raw_spin_lock_irqsave(&desc->lock, flags);
-
+ guard(raw_spinlock_irqsave)(&desc->lock);
irq_nmi_teardown(desc);
- devname = __cleanup_nmi(irq, desc);
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
- return devname;
+ return __cleanup_nmi(irq, desc);
}
/**
- * request_threaded_irq - allocate an interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Primary handler for threaded interrupts.
- * If handler is NULL and thread_fn != NULL
- * the default primary handler is installed.
- * @thread_fn: Function called from the irq handler thread
- * If NULL, no irq thread is created
- * @irqflags: Interrupt type flags
- * @devname: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. From the point this
- * call is made your handler function may be invoked. Since
- * your handler function must clear any interrupt the board
- * raises, you must take care both to initialise your hardware
- * and to set up the interrupt handler in the right order.
- *
- * If you want to set up a threaded irq handler for your device
- * then you need to supply @handler and @thread_fn. @handler is
- * still called in hard interrupt context and has to check
- * whether the interrupt originates from the device. If yes it
- * needs to disable the interrupt on the device and return
- * IRQ_WAKE_THREAD which will wake up the handler thread and run
- * @thread_fn. This split handler design is necessary to support
- * shared interrupts.
- *
- * Dev_id must be globally unique. Normally the address of the
- * device data structure is used as the cookie. Since the handler
- * receives this value it makes sense to use it.
- *
- * If your interrupt is shared you must pass a non NULL dev_id
- * as this is required when freeing the interrupt.
- *
- * Flags:
+ * request_threaded_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Primary handler for threaded interrupts.
+ * If handler is NULL and thread_fn != NULL
+ * the default primary handler is installed.
+ * @thread_fn: Function called from the irq handler thread
+ * If NULL, no irq thread is created
+ * @irqflags: Interrupt type flags
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. From the point this call is made your handler function
+ * may be invoked. Since your handler function must clear any interrupt the
+ * board raises, you must take care both to initialise your hardware and to
+ * set up the interrupt handler in the right order.
+ *
+ * If you want to set up a threaded irq handler for your device then you
+ * need to supply @handler and @thread_fn. @handler is still called in hard
+ * interrupt context and has to check whether the interrupt originates from
+ * the device. If yes it needs to disable the interrupt on the device and
+ * return IRQ_WAKE_THREAD which will wake up the handler thread and run
+ * @thread_fn. This split handler design is necessary to support shared
+ * interrupts.
+ *
+ * @dev_id must be globally unique. Normally the address of the device data
+ * structure is used as the cookie. Since the handler receives this value
+ * it makes sense to use it.
+ *
+ * If your interrupt is shared you must pass a non NULL dev_id as this is
+ * required when freeing the interrupt.
+ *
+ * Flags:
*
* IRQF_SHARED Interrupt is shared
* IRQF_TRIGGER_* Specify active edge(s) or level
@@ -2232,21 +2155,20 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL(request_threaded_irq);
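To make the split-handler contract concrete, a hypothetical driver (device structure and names invented) requesting a threaded interrupt might look like:

#include <linux/errno.h>
#include <linux/interrupt.h>

struct demo_dev {
	unsigned long events;	/* assumed per-device state */
};

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/*
	 * A real primary handler would check whether its device raised the
	 * line and quiesce it; this sketch simply defers to the thread.
	 */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	struct demo_dev *dd = dev_id;

	/* Sleeping work (I2C, regmap, ...) is fine here, it runs in a kthread. */
	dd->events++;
	return IRQ_HANDLED;
}

static int demo_setup(struct demo_dev *dd, unsigned int irq)
{
	/* dev_id is the per-device cookie and must be passed to free_irq() later. */
	return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
				    IRQF_ONESHOT, "demo", dd);
}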
/**
- * request_any_context_irq - allocate an interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Threaded handler for threaded interrupts.
- * @flags: Interrupt type flags
- * @name: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. It selects either a
- * hardirq or threaded handling method depending on the
- * context.
- *
- * On failure, it returns a negative value. On success,
- * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
+ * request_any_context_irq - allocate an interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @flags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. It selects either a hardirq or threaded handling
+ * method depending on the context.
+ *
+ * Returns: On failure, it returns a negative value. On success, it returns either
+ * IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
*/
int request_any_context_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *name, void *dev_id)
@@ -2273,37 +2195,35 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(request_any_context_irq);
/**
- * request_nmi - allocate an interrupt line for NMI delivery
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * Threaded handler for threaded interrupts.
- * @irqflags: Interrupt type flags
- * @name: An ascii name for the claiming device
- * @dev_id: A cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt line and IRQ handling. It sets up the IRQ line
- * to be handled as an NMI.
- *
- * An interrupt line delivering NMIs cannot be shared and IRQ handling
- * cannot be threaded.
- *
- * Interrupt lines requested for NMI delivering must produce per cpu
- * interrupts and have auto enabling setting disabled.
- *
- * Dev_id must be globally unique. Normally the address of the
- * device data structure is used as the cookie. Since the handler
- * receives this value it makes sense to use it.
- *
- * If the interrupt line cannot be used to deliver NMIs, function
- * will fail and return a negative value.
+ * request_nmi - allocate an interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @irqflags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt line
+ * and IRQ handling. It sets up the IRQ line to be handled as an NMI.
+ *
+ * An interrupt line delivering NMIs cannot be shared and IRQ handling
+ * cannot be threaded.
+ *
+ * Interrupt lines requested for NMI delivery must produce per-CPU
+ * interrupts and must have automatic enabling disabled.
+ *
+ * @dev_id must be globally unique. Normally the address of the device data
+ * structure is used as the cookie. Since the handler receives this value
+ * it makes sense to use it.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, the function will
+ * fail and return a negative value.
*/
int request_nmi(unsigned int irq, irq_handler_t handler,
unsigned long irqflags, const char *name, void *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
- unsigned long flags;
int retval;
if (irq == IRQ_NOTCONNECTED)
@@ -2345,21 +2265,17 @@ int request_nmi(unsigned int irq, irq_handler_t handler,
if (retval)
goto err_irq_setup;
- raw_spin_lock_irqsave(&desc->lock, flags);
-
- /* Setup NMI state */
- desc->istate |= IRQS_NMI;
- retval = irq_nmi_setup(desc);
- if (retval) {
- __cleanup_nmi(irq, desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return -EINVAL;
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ /* Setup NMI state */
+ desc->istate |= IRQS_NMI;
+ retval = irq_nmi_setup(desc);
+ if (retval) {
+ __cleanup_nmi(irq, desc);
+ return -EINVAL;
+ }
+ return 0;
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
- return 0;
-
err_irq_setup:
irq_chip_pm_put(&desc->irq_data);
err_out:
@@ -2370,35 +2286,25 @@ err_out:
void enable_percpu_irq(unsigned int irq, unsigned int type)
{
- unsigned int cpu = smp_processor_id();
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!desc)
- return;
-
- /*
- * If the trigger type is not specified by the caller, then
- * use the default for this interrupt.
- */
- type &= IRQ_TYPE_SENSE_MASK;
- if (type == IRQ_TYPE_NONE)
- type = irqd_get_trigger_type(&desc->irq_data);
-
- if (type != IRQ_TYPE_NONE) {
- int ret;
-
- ret = __irq_set_trigger(desc, type);
-
- if (ret) {
- WARN(1, "failed to set type for IRQ%d\n", irq);
- goto out;
+ /*
+ * If the trigger type is not specified by the caller, then
+ * use the default for this interrupt.
+ */
+ type &= IRQ_TYPE_SENSE_MASK;
+ if (type == IRQ_TYPE_NONE)
+ type = irqd_get_trigger_type(&desc->irq_data);
+
+ if (type != IRQ_TYPE_NONE) {
+ if (__irq_set_trigger(desc, type)) {
+ WARN(1, "failed to set type for IRQ%d\n", irq);
+ return;
+ }
}
+ irq_percpu_enable(desc, smp_processor_id());
}
-
- irq_percpu_enable(desc, cpu);
-out:
- irq_put_desc_unlock(desc, flags);
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
@@ -2416,33 +2322,16 @@ void enable_percpu_nmi(unsigned int irq, unsigned int type)
*/
bool irq_percpu_is_enabled(unsigned int irq)
{
- unsigned int cpu = smp_processor_id();
- struct irq_desc *desc;
- unsigned long flags;
- bool is_enabled;
-
- desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
- if (!desc)
- return false;
-
- is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled);
- irq_put_desc_unlock(desc, flags);
-
- return is_enabled;
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU)
+ return cpumask_test_cpu(smp_processor_id(), scoped_irqdesc->percpu_enabled);
+ return false;
}
EXPORT_SYMBOL_GPL(irq_percpu_is_enabled);
void disable_percpu_irq(unsigned int irq)
{
- unsigned int cpu = smp_processor_id();
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
-
- if (!desc)
- return;
-
- irq_percpu_disable(desc, cpu);
- irq_put_desc_unlock(desc, flags);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU)
+ irq_percpu_disable(scoped_irqdesc, smp_processor_id());
}
EXPORT_SYMBOL_GPL(disable_percpu_irq);
@@ -2458,71 +2347,47 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
if (!desc)
return NULL;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock) {
+ action = desc->action;
+ if (!action || action->percpu_dev_id != dev_id) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ return NULL;
+ }
- action = desc->action;
- if (!action || action->percpu_dev_id != dev_id) {
- WARN(1, "Trying to free already-free IRQ %d\n", irq);
- goto bad;
- }
+ if (!cpumask_empty(desc->percpu_enabled)) {
+ WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
+ irq, cpumask_first(desc->percpu_enabled));
+ return NULL;
+ }
- if (!cpumask_empty(desc->percpu_enabled)) {
- WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
- irq, cpumask_first(desc->percpu_enabled));
- goto bad;
+ /* Found it - now remove it from the list of entries: */
+ desc->action = NULL;
+ desc->istate &= ~IRQS_NMI;
}
- /* Found it - now remove it from the list of entries: */
- desc->action = NULL;
-
- desc->istate &= ~IRQS_NMI;
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
unregister_handler_proc(irq, action);
-
irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
return action;
-
-bad:
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return NULL;
}
/**
- * remove_percpu_irq - free a per-cpu interrupt
- * @irq: Interrupt line to free
- * @act: irqaction for the interrupt
+ * free_percpu_irq - free an interrupt allocated with request_percpu_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
*
- * Used to remove interrupts statically setup by the early boot process.
- */
-void remove_percpu_irq(unsigned int irq, struct irqaction *act)
-{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (desc && irq_settings_is_per_cpu_devid(desc))
- __free_percpu_irq(irq, act->percpu_dev_id);
-}
-
-/**
- * free_percpu_irq - free an interrupt allocated with request_percpu_irq
- * @irq: Interrupt line to free
- * @dev_id: Device identity to free
+ * Remove a percpu interrupt handler. The handler is removed, but the
+ * interrupt line is not disabled. This must be done on each CPU before
+ * calling this function. The function does not return until any executing
+ * interrupts for this IRQ have completed.
*
- * Remove a percpu interrupt handler. The handler is removed, but
- * the interrupt line is not disabled. This must be done on each
- * CPU before calling this function. The function does not return
- * until any executing interrupts for this IRQ have completed.
- *
- * This function must not be called from interrupt context.
+ * This function must not be called from interrupt context.
*/
void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
{
@@ -2551,9 +2416,9 @@ void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
}
/**
- * setup_percpu_irq - setup a per-cpu interrupt
- * @irq: Interrupt line to setup
- * @act: irqaction for the interrupt
+ * setup_percpu_irq - setup a per-cpu interrupt
+ * @irq: Interrupt line to setup
+ * @act: irqaction for the interrupt
*
* Used to statically setup per-cpu interrupts in the early boot process.
*/
@@ -2578,21 +2443,20 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
}
/**
- * __request_percpu_irq - allocate a percpu interrupt line
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * @flags: Interrupt type flags (IRQF_TIMER only)
- * @devname: An ascii name for the claiming device
- * @dev_id: A percpu cookie passed back to the handler function
- *
- * This call allocates interrupt resources and enables the
- * interrupt on the local CPU. If the interrupt is supposed to be
- * enabled on other CPUs, it has to be done on each CPU using
- * enable_percpu_irq().
- *
- * Dev_id must be globally unique. It is a per-cpu variable, and
- * the handler gets called with the interrupted CPU's instance of
- * that variable.
+ * __request_percpu_irq - allocate a percpu interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @flags: Interrupt type flags (IRQF_TIMER only)
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the interrupt on the
+ * local CPU. If the interrupt is supposed to be enabled on other CPUs, it
+ * has to be done on each CPU using enable_percpu_irq().
+ *
+ * @dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
*/
int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *devname,
@@ -2640,32 +2504,31 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
- * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
- * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs.
- * @name: An ascii name for the claiming device
- * @dev_id: A percpu cookie passed back to the handler function
+ * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @name: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
*
- * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
- * have to be setup on each CPU by calling prepare_percpu_nmi() before
- * being enabled on the same CPU by using enable_percpu_nmi().
+ * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
+ * have to be set up on each CPU by calling prepare_percpu_nmi() before
+ * being enabled on the same CPU by using enable_percpu_nmi().
*
- * Dev_id must be globally unique. It is a per-cpu variable, and
- * the handler gets called with the interrupted CPU's instance of
- * that variable.
+ * @dev_id must be globally unique. It is a per-cpu variable, and the
+ * handler gets called with the interrupted CPU's instance of that
+ * variable.
*
- * Interrupt lines requested for NMI delivering should have auto enabling
- * setting disabled.
+ * Interrupt lines requested for NMI delivery should have automatic
+ * enabling disabled.
*
- * If the interrupt line cannot be used to deliver NMIs, function
- * will fail returning a negative value.
+ * If the interrupt line cannot be used to deliver NMIs, the function
+ * will fail and return a negative value.
*/
int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
const char *name, void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
- unsigned long flags;
int retval;
if (!handler)
@@ -2701,10 +2564,8 @@ int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
if (retval)
goto err_irq_setup;
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->istate |= IRQS_NMI;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ desc->istate |= IRQS_NMI;
return 0;
err_irq_setup:
@@ -2716,79 +2577,55 @@ err_out:
}
/**
- * prepare_percpu_nmi - performs CPU local setup for NMI delivery
- * @irq: Interrupt line to prepare for NMI delivery
+ * prepare_percpu_nmi - performs CPU local setup for NMI delivery
+ * @irq: Interrupt line to prepare for NMI delivery
*
- * This call prepares an interrupt line to deliver NMI on the current CPU,
- * before that interrupt line gets enabled with enable_percpu_nmi().
+ * This call prepares an interrupt line to deliver NMI on the current CPU,
+ * before that interrupt line gets enabled with enable_percpu_nmi().
*
- * As a CPU local operation, this should be called from non-preemptible
- * context.
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
*
- * If the interrupt line cannot be used to deliver NMIs, function
- * will fail returning a negative value.
+ * If the interrupt line cannot be used to deliver NMIs, the function will
+ * fail and return a negative value.
*/
int prepare_percpu_nmi(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc;
- int ret = 0;
+ int ret = -EINVAL;
WARN_ON(preemptible());
- desc = irq_get_desc_lock(irq, &flags,
- IRQ_GET_DESC_CHECK_PERCPU);
- if (!desc)
- return -EINVAL;
-
- if (WARN(!irq_is_nmi(desc),
- KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
- irq)) {
- ret = -EINVAL;
- goto out;
- }
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ if (WARN(!irq_is_nmi(scoped_irqdesc),
+ "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", irq))
+ return -EINVAL;
- ret = irq_nmi_setup(desc);
- if (ret) {
- pr_err("Failed to setup NMI delivery: irq %u\n", irq);
- goto out;
+ ret = irq_nmi_setup(scoped_irqdesc);
+ if (ret)
+ pr_err("Failed to setup NMI delivery: irq %u\n", irq);
}
-
-out:
- irq_put_desc_unlock(desc, flags);
return ret;
}
/**
- * teardown_percpu_nmi - undoes NMI setup of IRQ line
- * @irq: Interrupt line from which CPU local NMI configuration should be
- * removed
- *
- * This call undoes the setup done by prepare_percpu_nmi().
+ * teardown_percpu_nmi - undoes NMI setup of IRQ line
+ * @irq: Interrupt line from which CPU local NMI configuration should be removed
*
- * IRQ line should not be enabled for the current CPU.
+ * This call undoes the setup done by prepare_percpu_nmi().
*
- * As a CPU local operation, this should be called from non-preemptible
- * context.
+ * IRQ line should not be enabled for the current CPU.
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
*/
void teardown_percpu_nmi(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc;
-
WARN_ON(preemptible());
- desc = irq_get_desc_lock(irq, &flags,
- IRQ_GET_DESC_CHECK_PERCPU);
- if (!desc)
- return;
-
- if (WARN_ON(!irq_is_nmi(desc)))
- goto out;
-
- irq_nmi_teardown(desc);
-out:
- irq_put_desc_unlock(desc, flags);
+ scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_PERCPU) {
+ if (WARN_ON(!irq_is_nmi(scoped_irqdesc)))
+ return;
+ irq_nmi_teardown(scoped_irqdesc);
+ }
}
static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state)
@@ -2815,87 +2652,62 @@ static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state
}
/**
- * irq_get_irqchip_state - returns the irqchip state of a interrupt.
- * @irq: Interrupt line that is forwarded to a VM
- * @which: One of IRQCHIP_STATE_* the caller wants to know about
- * @state: a pointer to a boolean where the state is to be stored
+ * irq_get_irqchip_state - returns the irqchip state of an interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: One of IRQCHIP_STATE_* the caller wants to know about
+ * @state: a pointer to a boolean where the state is to be stored
*
- * This call snapshots the internal irqchip state of an
- * interrupt, returning into @state the bit corresponding to
- * stage @which
+ * This call snapshots the internal irqchip state of an interrupt,
+ * returning into @state the bit corresponding to state @which.
*
- * This function should be called with preemption disabled if the
- * interrupt controller has per-cpu registers.
+ * This function should be called with preemption disabled if the interrupt
+ * controller has per-cpu registers.
*/
-int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
- bool *state)
+int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool *state)
{
- struct irq_desc *desc;
- struct irq_data *data;
- unsigned long flags;
- int err = -EINVAL;
-
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return err;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc);
- data = irq_desc_get_irq_data(desc);
-
- err = __irq_get_irqchip_state(data, which, state);
-
- irq_put_desc_busunlock(desc, flags);
- return err;
+ return __irq_get_irqchip_state(data, which, state);
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
/**
- * irq_set_irqchip_state - set the state of a forwarded interrupt.
- * @irq: Interrupt line that is forwarded to a VM
- * @which: State to be restored (one of IRQCHIP_STATE_*)
- * @val: Value corresponding to @which
+ * irq_set_irqchip_state - set the state of a forwarded interrupt.
+ * @irq: Interrupt line that is forwarded to a VM
+ * @which: State to be restored (one of IRQCHIP_STATE_*)
+ * @val: Value corresponding to @which
*
- * This call sets the internal irqchip state of an interrupt,
- * depending on the value of @which.
+ * This call sets the internal irqchip state of an interrupt, depending on
+ * the value of @which.
*
- * This function should be called with migration disabled if the
- * interrupt controller has per-cpu registers.
+ * This function should be called with migration disabled if the interrupt
+ * controller has per-cpu registers.
*/
-int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
- bool val)
+int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, bool val)
{
- struct irq_desc *desc;
- struct irq_data *data;
- struct irq_chip *chip;
- unsigned long flags;
- int err = -EINVAL;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_data *data = irq_desc_get_irq_data(scoped_irqdesc);
+ struct irq_chip *chip;
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return err;
+ do {
+ chip = irq_data_get_irq_chip(data);
- data = irq_desc_get_irq_data(desc);
+ if (WARN_ON_ONCE(!chip))
+ return -ENODEV;
- do {
- chip = irq_data_get_irq_chip(data);
- if (WARN_ON_ONCE(!chip)) {
- err = -ENODEV;
- goto out_unlock;
- }
- if (chip->irq_set_irqchip_state)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
+ if (chip->irq_set_irqchip_state)
+ break;
- if (data)
- err = chip->irq_set_irqchip_state(data, which, val);
+ data = irqd_get_parent_data(data);
+ } while (data);
-out_unlock:
- irq_put_desc_busunlock(desc, flags);
- return err;
+ if (data)
+ return chip->irq_set_irqchip_state(data, which, val);
+ }
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
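
A minimal sketch of the guard-based pattern the manage.c hunks above convert to, assuming the scoped_irqdesc_get_and_lock()/scoped_irqdesc helpers this series adds to kernel/irq/internals.h; example_mark_pending() is a hypothetical function, not part of the patch:

#include <linux/irq.h>
#include <linux/cleanup.h>

#include "internals.h"		/* sketch assumes it lives in kernel/irq/ */

static int example_mark_pending(unsigned int irq)
{
	/* The descriptor lock is dropped automatically on every return path. */
	scoped_irqdesc_get_and_lock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
		struct irq_desc *desc = scoped_irqdesc;

		if (!desc->action)
			return -ENOENT;

		desc->istate |= IRQS_PENDING;
		return 0;
	}
	/* Only reached when the descriptor lookup failed. */
	return -EINVAL;
}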
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index c05ba7ca00fa..9b09ad3f9914 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -59,7 +59,8 @@ struct msi_ctrl {
static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);
-
+static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg);
/**
* msi_alloc_desc - Allocate an initialized msi_desc
@@ -343,26 +344,30 @@ int msi_setup_device_data(struct device *dev)
}
/**
- * msi_lock_descs - Lock the MSI descriptor storage of a device
+ * __msi_lock_descs - Lock the MSI descriptor storage of a device
* @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
*/
-void msi_lock_descs(struct device *dev)
+void __msi_lock_descs(struct device *dev)
{
mutex_lock(&dev->msi.data->mutex);
}
-EXPORT_SYMBOL_GPL(msi_lock_descs);
+EXPORT_SYMBOL_GPL(__msi_lock_descs);
/**
- * msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * __msi_unlock_descs - Unlock the MSI descriptor storage of a device
* @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
*/
-void msi_unlock_descs(struct device *dev)
+void __msi_unlock_descs(struct device *dev)
{
/* Invalidate the index which was cached by the iterator */
dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
mutex_unlock(&dev->msi.data->mutex);
}
-EXPORT_SYMBOL_GPL(msi_unlock_descs);
+EXPORT_SYMBOL_GPL(__msi_unlock_descs);
static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
enum msi_desc_filter filter)
@@ -448,7 +453,6 @@ EXPORT_SYMBOL_GPL(msi_next_desc);
unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
{
struct msi_desc *desc;
- unsigned int ret = 0;
bool pcimsi = false;
struct xarray *xa;
@@ -462,7 +466,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne
if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
pcimsi = to_pci_dev(dev)->msi_enabled;
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
xa = &dev->msi.data->__domains[domid].store;
desc = xa_load(xa, pcimsi ? 0 : index);
if (desc && desc->irq) {
@@ -471,16 +475,12 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne
* PCI-MSIX and platform MSI use a descriptor per
* interrupt.
*/
- if (pcimsi) {
- if (index < desc->nvec_used)
- ret = desc->irq + index;
- } else {
- ret = desc->irq;
- }
+ if (!pcimsi)
+ return desc->irq;
+ if (index < desc->nvec_used)
+ return desc->irq + index;
}
-
- msi_unlock_descs(dev);
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(msi_domain_get_virq);
@@ -796,6 +796,10 @@ static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
return 0;
}
+static void msi_domain_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
+{
+}
+
static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
struct msi_desc *desc)
{
@@ -821,6 +825,7 @@ static struct msi_domain_ops msi_domain_ops_default = {
.get_hwirq = msi_domain_ops_get_hwirq,
.msi_init = msi_domain_ops_init,
.msi_prepare = msi_domain_ops_prepare,
+ .msi_teardown = msi_domain_ops_teardown,
.set_desc = msi_domain_ops_set_desc,
};
@@ -842,6 +847,8 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
ops->msi_init = msi_domain_ops_default.msi_init;
if (ops->msi_prepare == NULL)
ops->msi_prepare = msi_domain_ops_default.msi_prepare;
+ if (ops->msi_teardown == NULL)
+ ops->msi_teardown = msi_domain_ops_default.msi_teardown;
if (ops->set_desc == NULL)
ops->set_desc = msi_domain_ops_default.set_desc;
}
@@ -882,6 +889,7 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
if (domain) {
irq_domain_update_bus_token(domain, info->bus_token);
+ domain->dev = info->dev;
if (info->flags & MSI_FLAG_PARENT_PM_DEV)
domain->pm_dev = parent->pm_dev;
}
@@ -905,6 +913,32 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
}
/**
+ * msi_create_parent_irq_domain - Create an MSI-parent interrupt domain
+ * @info: MSI irqdomain creation info
+ * @msi_parent_ops: MSI parent callbacks and configuration
+ *
+ * Return: pointer to the created &struct irq_domain or %NULL on failure
+ */
+struct irq_domain *msi_create_parent_irq_domain(struct irq_domain_info *info,
+ const struct msi_parent_ops *msi_parent_ops)
+{
+ struct irq_domain *d;
+
+ info->hwirq_max = max(info->hwirq_max, info->size);
+ info->size = info->hwirq_max;
+ info->domain_flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ info->bus_token = msi_parent_ops->bus_select_token;
+
+ d = irq_domain_instantiate(info);
+ if (IS_ERR(d))
+ return NULL;
+
+ d->msi_parent_ops = msi_parent_ops;
+ return d;
+}
+EXPORT_SYMBOL_GPL(msi_create_parent_irq_domain);
+
+/**
* msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down
* in the domain hierarchy
* @dev: The device for which the domain should be created
@@ -998,9 +1032,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
void *chip_data)
{
struct irq_domain *domain, *parent = dev->msi.domain;
- struct fwnode_handle *fwnode, *fwnalloced = NULL;
- struct msi_domain_template *bundle;
const struct msi_parent_ops *pops;
+ struct fwnode_handle *fwnode;
if (!irq_domain_is_msi_parent(parent))
return false;
@@ -1008,7 +1041,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
return false;
- bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ struct msi_domain_template *bundle __free(kfree) =
+ kmemdup(template, sizeof(*bundle), GFP_KERNEL);
if (!bundle)
return false;
@@ -1017,6 +1051,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
bundle->info.ops = &bundle->ops;
bundle->info.data = domain_data;
bundle->info.chip_data = chip_data;
+ bundle->info.alloc_data = &bundle->alloc_info;
+ bundle->info.dev = dev;
pops = parent->msi_parent_ops;
snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
@@ -1031,41 +1067,42 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
* node as they are not guaranteed to have a fwnode. They are never
* looked up and always handled in the context of the device.
*/
- if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)
- fwnode = dev->fwnode;
+ struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL;
+
+ if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE))
+ fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name);
else
- fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name);
+ fwnode = dev->fwnode;
if (!fwnode)
- goto free_bundle;
+ return false;
if (msi_setup_device_data(dev))
- goto free_fwnode;
-
- msi_lock_descs(dev);
+ return false;
+ guard(msi_descs_lock)(dev);
if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
- goto fail;
+ return false;
if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
- goto fail;
+ return false;
domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
if (!domain)
- goto fail;
+ return false;
- domain->dev = dev;
dev->msi.data->__domains[domid].domain = domain;
- msi_unlock_descs(dev);
- return true;
-fail:
- msi_unlock_descs(dev);
-free_fwnode:
- irq_domain_free_fwnode(fwnalloced);
-free_bundle:
- kfree(bundle);
- return false;
+ if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
+ dev->msi.data->__domains[domid].domain = NULL;
+ irq_domain_remove(domain);
+ return false;
+ }
+
+ /* @bundle and @fwnode_alloced are now in use. Prevent cleanup */
+ retain_and_null_ptr(bundle);
+ retain_and_null_ptr(fwnode_alloced);
+ return true;
}
/**
@@ -1079,23 +1116,21 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
struct msi_domain_info *info;
struct irq_domain *domain;
- msi_lock_descs(dev);
-
+ guard(msi_descs_lock)(dev);
domain = msi_get_device_domain(dev, domid);
-
if (!domain || !irq_domain_is_msi_device(domain))
- goto unlock;
+ return;
dev->msi.data->__domains[domid].domain = NULL;
info = domain->host_data;
+
+ info->ops->msi_teardown(domain, info->alloc_data);
+
if (irq_domain_is_msi_device(domain))
fwnode = domain->fwnode;
irq_domain_remove(domain);
irq_domain_free_fwnode(fwnode);
kfree(container_of(info, struct msi_domain_template, info));
-
-unlock:
- msi_unlock_descs(dev);
}
/**
@@ -1111,16 +1146,14 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
{
struct msi_domain_info *info;
struct irq_domain *domain;
- bool ret = false;
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
domain = msi_get_device_domain(dev, domid);
if (domain && irq_domain_is_msi_device(domain)) {
info = domain->host_data;
- ret = info->bus_token == bus_token;
+ return info->bus_token == bus_token;
}
- msi_unlock_descs(dev);
- return ret;
+ return false;
}
static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
@@ -1238,6 +1271,24 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
return 0;
}
+static int populate_alloc_info(struct irq_domain *domain, struct device *dev,
+ unsigned int nirqs, msi_alloc_info_t *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+
+ /*
+ * If the caller has provided a template alloc info, use that. Once
+ * all users of msi_create_irq_domain() have been eliminated, this
+ * should be the only source of allocation information, and the
+ * prepare call below should be finally removed.
+ */
+ if (!info->alloc_data)
+ return msi_domain_prepare_irqs(domain, dev, nirqs, arg);
+
+ *arg = *info->alloc_data;
+ return 0;
+}
+
static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
struct msi_ctrl *ctrl)
{
@@ -1250,7 +1301,7 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain
unsigned long idx;
int i, ret, virq;
- ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
+ ret = populate_alloc_info(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
@@ -1391,12 +1442,9 @@ int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
unsigned int first, unsigned int last)
{
- int ret;
- msi_lock_descs(dev);
- ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
- msi_unlock_descs(dev);
- return ret;
+ guard(msi_descs_lock)(dev);
+ return msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
}
EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range);
@@ -1500,12 +1548,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u
const struct irq_affinity_desc *affdesc,
union msi_instance_cookie *icookie)
{
- struct msi_map map;
-
- msi_lock_descs(dev);
- map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie);
- msi_unlock_descs(dev);
- return map;
+ guard(msi_descs_lock)(dev);
+ return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie);
}
/**
@@ -1542,13 +1586,11 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq,
icookie.value = ((u64)type << 32) | hwirq;
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain))
map.index = -EINVAL;
else
map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie);
- msi_unlock_descs(dev);
-
return map.index >= 0 ? map.virq : map.index;
}
@@ -1641,9 +1683,8 @@ void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
unsigned int first, unsigned int last)
{
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
msi_domain_free_irqs_range_locked(dev, domid, first, last);
- msi_unlock_descs(dev);
}
EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all);
@@ -1673,9 +1714,8 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
*/
void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
{
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
msi_domain_free_irqs_all_locked(dev, domid);
- msi_unlock_descs(dev);
}
/**
@@ -1694,12 +1734,11 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq)
if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI))
return;
- msi_lock_descs(dev);
- if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) {
- msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index,
- desc->msi_index);
- }
- msi_unlock_descs(dev);
+ guard(msi_descs_lock)(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain))
+ return;
+ msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index,
+ desc->msi_index);
}
/**
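
The msi_create_device_irq_domain() rework above leans on the scope-based cleanup helpers from linux/cleanup.h: __free(kfree) frees the allocation on any early return, guard(msi_descs_lock) replaces the msi_lock_descs()/msi_unlock_descs() pair, and retain_and_null_ptr() disarms the cleanup once ownership has been handed over. A hedged sketch of that idiom, with example_cfg and example_publish() as hypothetical stand-ins:

#include <linux/cleanup.h>
#include <linux/device.h>
#include <linux/msi.h>
#include <linux/slab.h>

struct example_cfg { int id; };

/* Hypothetical stub: real code would store @cfg in a longer-lived owner. */
static bool example_publish(struct device *dev, struct example_cfg *cfg)
{
	return dev && cfg;
}

static bool example_setup(struct device *dev, const struct example_cfg *tmpl)
{
	struct example_cfg *cfg __free(kfree) =
		kmemdup(tmpl, sizeof(*cfg), GFP_KERNEL);

	if (!cfg)
		return false;

	/* The MSI descriptor mutex is dropped on every return path below. */
	guard(msi_descs_lock)(dev);

	if (!example_publish(dev, cfg))
		return false;			/* @cfg is kfree()d by __free() */

	/* Ownership transferred: prevent the automatic kfree(). */
	retain_and_null_ptr(cfg);
	return true;
}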
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index c556bc49d213..f7394729cedc 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -13,17 +13,13 @@
#include "internals.h"
-bool irq_pm_check_wakeup(struct irq_desc *desc)
+void irq_pm_handle_wakeup(struct irq_desc *desc)
{
- if (irqd_is_wakeup_armed(&desc->irq_data)) {
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
- desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
- desc->depth++;
- irq_disable(desc);
- pm_system_irq_wakeup(irq_desc_get_irq(desc));
- return true;
- }
- return false;
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+ desc->depth++;
+ irq_disable(desc);
+ pm_system_irq_wakeup(irq_desc_get_irq(desc));
}
/*
@@ -46,8 +42,7 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
desc->cond_suspend_depth++;
WARN_ON_ONCE(desc->no_suspend_depth &&
- (desc->no_suspend_depth +
- desc->cond_suspend_depth) != desc->nr_actions);
+ (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions);
}
/*
@@ -134,14 +129,12 @@ void suspend_device_irqs(void)
int irq;
for_each_irq_desc(irq, desc) {
- unsigned long flags;
bool sync;
if (irq_settings_is_nested_thread(desc))
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
- sync = suspend_device_irq(desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irqsave, &desc->lock)
+ sync = suspend_device_irq(desc);
if (sync)
synchronize_irq(irq);
@@ -186,18 +179,15 @@ static void resume_irqs(bool want_early)
int irq;
for_each_irq_desc(irq, desc) {
- unsigned long flags;
- bool is_early = desc->action &&
- desc->action->flags & IRQF_EARLY_RESUME;
+ bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME;
if (!is_early && want_early)
continue;
if (irq_settings_is_nested_thread(desc))
continue;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
resume_irq(desc);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -207,22 +197,16 @@ static void resume_irqs(bool want_early)
*/
void rearm_wake_irq(unsigned int irq)
{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
-
- if (!desc)
- return;
+ scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) {
+ struct irq_desc *desc = scoped_irqdesc;
- if (!(desc->istate & IRQS_SUSPENDED) ||
- !irqd_is_wakeup_set(&desc->irq_data))
- goto unlock;
+ if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data))
+ return;
- desc->istate &= ~IRQS_SUSPENDED;
- irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
- __enable_irq(desc);
-
-unlock:
- irq_put_desc_busunlock(desc, flags);
+ desc->istate &= ~IRQS_SUSPENDED;
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ __enable_irq(desc);
+ }
}
/**
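
The suspend_device_irqs() hunk above shows why scoped_guard() is used rather than a plain guard() here: the lock must cover only the decision, while synchronize_irq() has to run with desc->lock released. A small sketch of that shape, with example_decide() as a hypothetical predicate:

#include <linux/cleanup.h>
#include <linux/interrupt.h>
#include <linux/irqdesc.h>
#include <linux/spinlock.h>

/* Hypothetical predicate evaluated under the descriptor lock. */
static bool example_decide(struct irq_desc *desc)
{
	return desc->action != NULL;
}

static void example_suspend_one(struct irq_desc *desc, unsigned int irq)
{
	bool sync = false;

	scoped_guard(raw_spinlock_irqsave, &desc->lock)
		sync = example_decide(desc);	/* lock held only for this */

	if (sync)
		synchronize_irq(irq);		/* must not run under desc->lock */
}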
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 8e29809de38d..29c2404e743b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -81,20 +81,18 @@ static int show_irq_affinity(int type, struct seq_file *m)
static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
- unsigned long flags;
cpumask_var_t mask;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (desc->affinity_hint)
- cpumask_copy(mask, desc->affinity_hint);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ scoped_guard(raw_spinlock_irq, &desc->lock) {
+ if (desc->affinity_hint)
+ cpumask_copy(mask, desc->affinity_hint);
+ }
seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
free_cpumask_var(mask);
-
return 0;
}
@@ -295,32 +293,26 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
#define MAX_NAMELEN 128
-static int name_unique(unsigned int irq, struct irqaction *new_action)
+static bool name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
- unsigned long flags;
- int ret = 1;
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irq)(&desc->lock);
for_each_action_of_desc(desc, action) {
if ((action != new_action) && action->name &&
- !strcmp(new_action->name, action->name)) {
- ret = 0;
- break;
- }
+ !strcmp(new_action->name, action->name))
+ return false;
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- return ret;
+ return true;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
{
- char name [MAX_NAMELEN];
+ char name[MAX_NAMELEN];
struct irq_desc *desc = irq_to_desc(irq);
- if (!desc->dir || action->dir || !action->name ||
- !name_unique(irq, action))
+ if (!desc->dir || action->dir || !action->name || !name_unique(irq, action))
return;
snprintf(name, MAX_NAMELEN, "%s", action->name);
@@ -347,17 +339,16 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
* added, not when the descriptor is created, so multiple
* tasks might try to register at the same time.
*/
- mutex_lock(&register_lock);
+ guard(mutex)(&register_lock);
if (desc->dir)
- goto out_unlock;
-
- sprintf(name, "%d", irq);
+ return;
/* create /proc/irq/1234 */
+ sprintf(name, "%u", irq);
desc->dir = proc_mkdir(name, root_irq_dir);
if (!desc->dir)
- goto out_unlock;
+ return;
#ifdef CONFIG_SMP
umode_t umode = S_IRUGO;
@@ -366,31 +357,27 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
umode |= S_IWUSR;
/* create /proc/irq/<irq>/smp_affinity */
- proc_create_data("smp_affinity", umode, desc->dir,
- &irq_affinity_proc_ops, irqp);
+ proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp);
/* create /proc/irq/<irq>/affinity_hint */
proc_create_single_data("affinity_hint", 0444, desc->dir,
- irq_affinity_hint_proc_show, irqp);
+ irq_affinity_hint_proc_show, irqp);
/* create /proc/irq/<irq>/smp_affinity_list */
proc_create_data("smp_affinity_list", umode, desc->dir,
&irq_affinity_list_proc_ops, irqp);
- proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show,
- irqp);
+ proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp);
# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
proc_create_single_data("effective_affinity", 0444, desc->dir,
- irq_effective_aff_proc_show, irqp);
+ irq_effective_aff_proc_show, irqp);
proc_create_single_data("effective_affinity_list", 0444, desc->dir,
- irq_effective_aff_list_proc_show, irqp);
+ irq_effective_aff_list_proc_show, irqp);
# endif
#endif
proc_create_single_data("spurious", 0444, desc->dir,
- irq_spurious_proc_show, (void *)(long)irq);
+ irq_spurious_proc_show, (void *)(long)irq);
-out_unlock:
- mutex_unlock(&register_lock);
}
void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
@@ -468,7 +455,6 @@ int show_interrupts(struct seq_file *p, void *v)
int i = *(loff_t *) v, j;
struct irqaction *action;
struct irq_desc *desc;
- unsigned long flags;
if (i > ACTUAL_NR_IRQS)
return 0;
@@ -487,13 +473,13 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
}
- rcu_read_lock();
+ guard(rcu)();
desc = irq_to_desc(i);
if (!desc || irq_settings_is_hidden(desc))
- goto outsparse;
+ return 0;
if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs)
- goto outsparse;
+ return 0;
seq_printf(p, "%*d:", prec, i);
for_each_online_cpu(j) {
@@ -503,7 +489,7 @@ int show_interrupts(struct seq_file *p, void *v)
}
seq_putc(p, ' ');
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irq)(&desc->lock);
if (desc->irq_data.chip) {
if (desc->irq_data.chip->irq_print_chip)
desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
@@ -532,9 +518,6 @@ int show_interrupts(struct seq_file *p, void *v)
}
seq_putc(p, '\n');
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-outsparse:
- rcu_read_unlock();
return 0;
}
#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 1b7fa72968bd..ca9cc1b806a9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -30,18 +30,17 @@ static DEFINE_RAW_SPINLOCK(irq_resend_lock);
*/
static void resend_irqs(struct tasklet_struct *unused)
{
- struct irq_desc *desc;
-
- raw_spin_lock_irq(&irq_resend_lock);
+ guard(raw_spinlock_irq)(&irq_resend_lock);
while (!hlist_empty(&irq_resend_list)) {
- desc = hlist_entry(irq_resend_list.first, struct irq_desc,
- resend_node);
+ struct irq_desc *desc;
+
+ desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node);
hlist_del_init(&desc->resend_node);
+
raw_spin_unlock(&irq_resend_lock);
desc->handle_irq(desc);
raw_spin_lock(&irq_resend_lock);
}
- raw_spin_unlock_irq(&irq_resend_lock);
}
/* Tasklet to handle resend: */
@@ -75,19 +74,18 @@ static int irq_sw_resend(struct irq_desc *desc)
}
/* Add to resend_list and activate the softirq: */
- raw_spin_lock(&irq_resend_lock);
- if (hlist_unhashed(&desc->resend_node))
- hlist_add_head(&desc->resend_node, &irq_resend_list);
- raw_spin_unlock(&irq_resend_lock);
+ scoped_guard(raw_spinlock, &irq_resend_lock) {
+ if (hlist_unhashed(&desc->resend_node))
+ hlist_add_head(&desc->resend_node, &irq_resend_list);
+ }
tasklet_schedule(&resend_tasklet);
return 0;
}
void clear_irq_resend(struct irq_desc *desc)
{
- raw_spin_lock(&irq_resend_lock);
+ guard(raw_spinlock)(&irq_resend_lock);
hlist_del_init(&desc->resend_node);
- raw_spin_unlock(&irq_resend_lock);
}
void irq_resend_init(struct irq_desc *desc)
@@ -172,30 +170,24 @@ int check_irq_resend(struct irq_desc *desc, bool inject)
*/
int irq_inject_interrupt(unsigned int irq)
{
- struct irq_desc *desc;
- unsigned long flags;
- int err;
+ int err = -EINVAL;
/* Try the state injection hardware interface first */
if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true))
return 0;
/* That failed, try via the resend mechanism */
- desc = irq_get_desc_buslock(irq, &flags, 0);
- if (!desc)
- return -EINVAL;
+ scoped_irqdesc_get_and_buslock(irq, 0) {
+ struct irq_desc *desc = scoped_irqdesc;
- /*
- * Only try to inject when the interrupt is:
- * - not NMI type
- * - activated
- */
- if (irq_is_nmi(desc) || !irqd_is_activated(&desc->irq_data))
- err = -EINVAL;
- else
- err = check_irq_resend(desc, true);
-
- irq_put_desc_busunlock(desc, flags);
+ /*
+ * Only try to inject when the interrupt is:
+ * - not NMI type
+ * - activated
+ */
+ if (!irq_is_nmi(desc) && irqd_is_activated(&desc->irq_data))
+ err = check_irq_resend(desc, true);
+ }
return err;
}
EXPORT_SYMBOL_GPL(irq_inject_interrupt);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 02b2daf07441..73280ccb74b0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -19,77 +19,41 @@ static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(struct timer_list *unused);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
-static int irq_poll_cpu;
+int irq_poll_cpu;
static atomic_t irq_poll_active;
/*
- * We wait here for a poller to finish.
- *
- * If the poll runs on this CPU, then we yell loudly and return
- * false. That will leave the interrupt line disabled in the worst
- * case, but it should never happen.
- *
- * We wait until the poller is done and then recheck disabled and
- * action (about to be disabled). Only if it's still active, we return
- * true and let the handler run.
- */
-bool irq_wait_for_poll(struct irq_desc *desc)
- __must_hold(&desc->lock)
-{
- if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
- "irq poll in progress on cpu %d for irq %d\n",
- smp_processor_id(), desc->irq_data.irq))
- return false;
-
-#ifdef CONFIG_SMP
- do {
- raw_spin_unlock(&desc->lock);
- while (irqd_irq_inprogress(&desc->irq_data))
- cpu_relax();
- raw_spin_lock(&desc->lock);
- } while (irqd_irq_inprogress(&desc->irq_data));
- /* Might have been disabled in meantime */
- return !irqd_irq_disabled(&desc->irq_data) && desc->action;
-#else
- return false;
-#endif
-}
-
-
-/*
* Recovery handler for misrouted interrupts.
*/
-static int try_one_irq(struct irq_desc *desc, bool force)
+static bool try_one_irq(struct irq_desc *desc, bool force)
{
- irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
+ bool ret = false;
- raw_spin_lock(&desc->lock);
+ guard(raw_spinlock)(&desc->lock);
/*
* PER_CPU, nested thread interrupts and interrupts explicitly
* marked polled are excluded from polling.
*/
- if (irq_settings_is_per_cpu(desc) ||
- irq_settings_is_nested_thread(desc) ||
+ if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc) ||
irq_settings_is_polled(desc))
- goto out;
+ return false;
/*
* Do not poll disabled interrupts unless the spurious
* disabled poller asks explicitly.
*/
if (irqd_irq_disabled(&desc->irq_data) && !force)
- goto out;
+ return false;
/*
* All handlers must agree on IRQF_SHARED, so we test just the
* first.
*/
action = desc->action;
- if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER))
- goto out;
+ if (!action || !(action->flags & IRQF_SHARED) || (action->flags & __IRQF_TIMER))
+ return false;
/* Already running on another processor */
if (irqd_irq_inprogress(&desc->irq_data)) {
@@ -98,21 +62,19 @@ static int try_one_irq(struct irq_desc *desc, bool force)
* CPU to go looking for our mystery interrupt too
*/
desc->istate |= IRQS_PENDING;
- goto out;
+ return false;
}
/* Mark it poll in progress */
desc->istate |= IRQS_POLL_INPROGRESS;
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
- ret = IRQ_HANDLED;
+ ret = true;
/* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
-out:
- raw_spin_unlock(&desc->lock);
- return ret == IRQ_HANDLED;
+ return ret;
}
static int misrouted_irq(int irq)
@@ -157,8 +119,7 @@ static void poll_spurious_irqs(struct timer_list *unused)
continue;
/* Racy but it doesn't matter */
- state = desc->istate;
- barrier();
+ state = READ_ONCE(desc->istate);
if (!(state & IRQS_SPURIOUS_DISABLED))
continue;
@@ -168,8 +129,7 @@ static void poll_spurious_irqs(struct timer_list *unused)
}
out:
atomic_dec(&irq_poll_active);
- mod_timer(&poll_spurious_irq_timer,
- jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
static inline int bad_action_ret(irqreturn_t action_ret)
@@ -193,17 +153,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
unsigned int irq = irq_desc_get_irq(desc);
struct irqaction *action;
- unsigned long flags;
- if (bad_action_ret(action_ret)) {
- printk(KERN_ERR "irq event %d: bogus return value %x\n",
- irq, action_ret);
- } else {
- printk(KERN_ERR "irq %d: nobody cared (try booting with "
- "the \"irqpoll\" option)\n", irq);
- }
+ if (bad_action_ret(action_ret))
+ pr_err("irq event %d: bogus return value %x\n", irq, action_ret);
+ else
+ pr_err("irq %d: nobody cared (try booting with the \"irqpoll\" option)\n", irq);
dump_stack();
- printk(KERN_ERR "handlers:\n");
+ pr_err("handlers:\n");
/*
* We need to take desc->lock here. note_interrupt() is called
@@ -211,15 +167,13 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
* with something else removing an action. It's ok to take
* desc->lock here. See synchronize_irq().
*/
- raw_spin_lock_irqsave(&desc->lock, flags);
+ guard(raw_spinlock_irqsave)(&desc->lock);
for_each_action_of_desc(desc, action) {
- printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler);
+ pr_err("[<%p>] %ps", action->handler, action->handler);
if (action->thread_fn)
- printk(KERN_CONT " threaded [<%p>] %ps",
- action->thread_fn, action->thread_fn);
- printk(KERN_CONT "\n");
+ pr_cont(" threaded [<%p>] %ps", action->thread_fn, action->thread_fn);
+ pr_cont("\n");
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
@@ -232,18 +186,17 @@ static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
}
}
-static inline int
-try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+static inline bool try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
+ irqreturn_t action_ret)
{
struct irqaction *action;
if (!irqfixup)
- return 0;
+ return false;
/* We didn't actually handle the IRQ - see if it was misrouted? */
if (action_ret == IRQ_NONE)
- return 1;
+ return true;
/*
* But for 'irqfixup == 2' we also do it for handled interrupts if
@@ -251,19 +204,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
* traditional PC timer interrupt.. Legacy)
*/
if (irqfixup < 2)
- return 0;
+ return false;
if (!irq)
- return 1;
+ return true;
/*
* Since we don't get the descriptor lock, "action" can
- * change under us. We don't really care, but we don't
- * want to follow a NULL pointer. So tell the compiler to
- * just load it once by using a barrier.
+ * change under us.
*/
- action = desc->action;
- barrier();
+ action = READ_ONCE(desc->action);
return action && (action->flags & IRQF_IRQPOLL);
}
@@ -273,8 +223,7 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
{
unsigned int irq;
- if (desc->istate & IRQS_POLL_INPROGRESS ||
- irq_settings_is_polled(desc))
+ if (desc->istate & IRQS_POLL_INPROGRESS || irq_settings_is_polled(desc))
return;
if (bad_action_ret(action_ret)) {
@@ -420,13 +369,12 @@ void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
/*
* Now kill the IRQ
*/
- printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+ pr_emerg("Disabling IRQ #%d\n", irq);
desc->istate |= IRQS_SPURIOUS_DISABLED;
desc->depth++;
irq_disable(desc);
- mod_timer(&poll_spurious_irq_timer,
- jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
+ mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
desc->irqs_unhandled = 0;
}
@@ -436,11 +384,9 @@ bool noirqdebug __read_mostly;
int noirqdebug_setup(char *str)
{
noirqdebug = 1;
- printk(KERN_INFO "IRQ lockup detection disabled\n");
-
+ pr_info("IRQ lockup detection disabled\n");
return 1;
}
-
__setup("noirqdebug", noirqdebug_setup);
module_param(noirqdebug, bool, 0644);
MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
@@ -452,12 +398,10 @@ static int __init irqfixup_setup(char *str)
return 1;
}
irqfixup = 1;
- printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
- printk(KERN_WARNING "This may impact system performance.\n");
-
+ pr_warn("Misrouted IRQ fixup support enabled.\n");
+ pr_warn("This may impact system performance.\n");
return 1;
}
-
__setup("irqfixup", irqfixup_setup);
module_param(irqfixup, int, 0644);
@@ -468,11 +412,8 @@ static int __init irqpoll_setup(char *str)
return 1;
}
irqfixup = 2;
- printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
- "enabled\n");
- printk(KERN_WARNING "This may significantly impact system "
- "performance\n");
+ pr_warn("Misrouted IRQ fixup and polling support enabled\n");
+ pr_warn("This may significantly impact system performance\n");
return 1;
}
-
__setup("irqpoll", irqpoll_setup);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 4198f30aac3c..1e7635864124 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -829,8 +829,7 @@ static struct bpf_iter_reg ksym_iter_reg_info = {
.seq_info = &ksym_iter_seq_info,
};
-BTF_ID_LIST(btf_ksym_iter_id)
-BTF_ID(struct, kallsym_iter)
+BTF_ID_LIST_SINGLE(btf_ksym_iter_id, struct, kallsym_iter)
static int __init bpf_ksym_iter_register(void)
{
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 6ce73cceaf53..49ab81faaed9 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -533,7 +533,7 @@ static void test_barrier_nothreads(struct kunit *test)
struct kcsan_scoped_access *reorder_access = NULL;
#endif
arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
- atomic_t dummy;
+ atomic_t dummy = ATOMIC_INIT(0);
KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
@@ -1501,7 +1501,7 @@ static int access_thread(void *arg)
}
} while (!torture_must_stop());
timer_delete_sync(&timer);
- destroy_timer_on_stack(&timer);
+ timer_destroy_on_stack(&timer);
torture_kthread_stopping("access_thread");
return 0;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 3e62b944c883..351cd7d76dfa 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -877,6 +877,60 @@ int kimage_load_segment(struct kimage *image,
return result;
}
+void *kimage_map_segment(struct kimage *image,
+ unsigned long addr, unsigned long size)
+{
+ unsigned long src_page_addr, dest_page_addr = 0;
+ unsigned long eaddr = addr + size;
+ kimage_entry_t *ptr, entry;
+ struct page **src_pages;
+ unsigned int npages;
+ void *vaddr = NULL;
+ int i;
+
+ /*
+ * Collect the source pages and map them in a contiguous VA range.
+ */
+ npages = PFN_UP(eaddr) - PFN_DOWN(addr);
+ src_pages = kmalloc_array(npages, sizeof(*src_pages), GFP_KERNEL);
+ if (!src_pages) {
+ pr_err("Could not allocate ima pages array.\n");
+ return NULL;
+ }
+
+ i = 0;
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION) {
+ dest_page_addr = entry & PAGE_MASK;
+ } else if (entry & IND_SOURCE) {
+ if (dest_page_addr >= addr && dest_page_addr < eaddr) {
+ src_page_addr = entry & PAGE_MASK;
+ src_pages[i++] =
+ virt_to_page(__va(src_page_addr));
+ if (i == npages)
+ break;
+ dest_page_addr += PAGE_SIZE;
+ }
+ }
+ }
+
+ /* Sanity check. */
+ WARN_ON(i < npages);
+
+ vaddr = vmap(src_pages, npages, VM_MAP, PAGE_KERNEL);
+ kfree(src_pages);
+
+ if (!vaddr)
+ pr_err("Could not map ima buffer.\n");
+
+ return vaddr;
+}
+
+void kimage_unmap_segment(void *segment_buffer)
+{
+ vunmap(segment_buffer);
+}
+
struct kexec_load_limit {
/* Mutex protects the limit count. */
struct mutex mutex;
@@ -1026,7 +1080,7 @@ int kernel_kexec(void)
console_suspend_all();
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
- goto Resume_console;
+ goto Resume_devices;
/*
* dpm_suspend_end() must be called after dpm_suspend_start()
* to complete the transition, like in the hibernation flows
@@ -1081,7 +1135,6 @@ int kernel_kexec(void)
dpm_resume_start(PMSG_RESTORE);
Resume_devices:
dpm_resume_end(PMSG_RESTORE);
- Resume_console:
console_resume_all();
thaw_processes();
Restore_console:
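
A hypothetical caller of the kimage_map_segment()/kimage_unmap_segment() helpers added above, sketching how a loaded segment could be patched in place after it has been scattered across source pages (the IMA post-load path wired up in kexec_file.c below is presumably the intended user); example_patch_segment() and its parameters are assumptions, not part of the patch:

#include <linux/kexec.h>
#include <linux/minmax.h>
#include <linux/string.h>

static int example_patch_segment(struct kimage *image, unsigned long addr,
				 unsigned long size, const void *data,
				 size_t len)
{
	void *va = kimage_map_segment(image, addr, size);

	if (!va)
		return -ENOMEM;

	/* Update the copy that will land at @addr after kexec. */
	memcpy(va, data, min_t(size_t, len, size));
	kimage_unmap_segment(va);
	return 0;
}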
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index fba686487e3b..b835033c65eb 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -19,7 +19,6 @@
#include <linux/list.h>
#include <linux/fs.h>
#include <linux/ima.h>
-#include <crypto/hash.h>
#include <crypto/sha2.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
@@ -38,6 +37,21 @@ void set_kexec_sig_enforced(void)
}
#endif
+#ifdef CONFIG_IMA_KEXEC
+static bool check_ima_segment_index(struct kimage *image, int i)
+{
+ if (image->is_ima_segment_index_set && i == image->ima_segment_index)
+ return true;
+ else
+ return false;
+}
+#else
+static bool check_ima_segment_index(struct kimage *image, int i)
+{
+ return false;
+}
+#endif
+
static int kexec_calculate_store_digests(struct kimage *image);
/* Maximum size in bytes for kernel/initrd files. */
@@ -186,6 +200,15 @@ kimage_validate_signature(struct kimage *image)
}
#endif
+static int kexec_post_load(struct kimage *image, unsigned long flags)
+{
+#ifdef CONFIG_IMA_KEXEC
+ if (!(flags & KEXEC_FILE_ON_CRASH))
+ ima_kexec_post_load(image);
+#endif
+ return machine_kexec_post_load(image);
+}
+
/*
* In file mode list of segments is prepared by kernel. Copy relevant
* data from user space, do error checking, prepare segment list
@@ -253,6 +276,11 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
/* IMA needs to pass the measurement list to the next kernel. */
ima_add_kexec_buffer(image);
+ /* If KHO is active, add its images to the list */
+ ret = kho_fill_kimage(image);
+ if (ret)
+ goto out;
+
/* Call image load handler */
ldata = kexec_image_load_default(image);
@@ -413,7 +441,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
kimage_terminate(image);
- ret = machine_kexec_post_load(image);
+ ret = kexec_post_load(image, flags);
if (ret)
goto out;
@@ -445,6 +473,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
temp_end = min(end, kbuf->buf_max);
temp_start = temp_end - kbuf->memsz + 1;
+ kexec_random_range_start(temp_start, temp_end, kbuf, &temp_start);
do {
/* align down start */
@@ -489,6 +518,8 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
temp_start = max(start, kbuf->buf_min);
+ kexec_random_range_start(temp_start, end, kbuf, &temp_start);
+
do {
temp_start = ALIGN(temp_start, kbuf->buf_align);
temp_end = temp_start + kbuf->memsz - 1;
@@ -648,6 +679,14 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
return 0;
+ /*
+ * If KHO is active, only use KHO scratch memory. All other memory
+ * could potentially be handed over.
+ */
+ ret = kho_locate_mem_hole(kbuf, locate_mem_hole_callback);
+ if (ret <= 0)
+ return ret;
+
if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
@@ -712,11 +751,10 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
/* Calculate and store the digest of segments */
static int kexec_calculate_store_digests(struct kimage *image)
{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
+ struct sha256_ctx sctx;
int ret = 0, i, j, zero_buf_sz, sha_region_sz;
- size_t desc_size, nullsz;
- char *digest;
+ size_t nullsz;
+ u8 digest[SHA256_DIGEST_SIZE];
void *zero_buf;
struct kexec_sha_region *sha_regions;
struct purgatory_info *pi = &image->purgatory_info;
@@ -727,37 +765,12 @@ static int kexec_calculate_store_digests(struct kimage *image)
zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
zero_buf_sz = PAGE_SIZE;
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto out;
- }
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- desc = kzalloc(desc_size, GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto out_free_tfm;
- }
-
sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions) {
- ret = -ENOMEM;
- goto out_free_desc;
- }
+ if (!sha_regions)
+ return -ENOMEM;
- desc->tfm = tfm;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto out_free_sha_regions;
-
- digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
- if (!digest) {
- ret = -ENOMEM;
- goto out_free_sha_regions;
- }
+ sha256_init(&sctx);
for (j = i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
@@ -776,10 +789,14 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (ksegment->kbuf == pi->purgatory_buf)
continue;
- ret = crypto_shash_update(desc, ksegment->kbuf,
- ksegment->bufsz);
- if (ret)
- break;
+ /*
+ * Skip the segment if ima_segment_index is set and matches
+ * the current index
+ */
+ if (check_ima_segment_index(image, i))
+ continue;
+
+ sha256_update(&sctx, ksegment->kbuf, ksegment->bufsz);
/*
* Assume rest of the buffer is filled with zero and
@@ -791,44 +808,26 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (bytes > zero_buf_sz)
bytes = zero_buf_sz;
- ret = crypto_shash_update(desc, zero_buf, bytes);
- if (ret)
- break;
+ sha256_update(&sctx, zero_buf, bytes);
nullsz -= bytes;
}
- if (ret)
- break;
-
sha_regions[j].start = ksegment->mem;
sha_regions[j].len = ksegment->memsz;
j++;
}
- if (!ret) {
- ret = crypto_shash_final(desc, digest);
- if (ret)
- goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
- sha_regions, sha_region_sz, 0);
- if (ret)
- goto out_free_digest;
+ sha256_final(&sctx, digest);
- ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest",
- digest, SHA256_DIGEST_SIZE, 0);
- if (ret)
- goto out_free_digest;
- }
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_sha_regions;
-out_free_digest:
- kfree(digest);
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
out_free_sha_regions:
vfree(sha_regions);
-out_free_desc:
- kfree(desc);
-out_free_tfm:
- kfree(tfm);
-out:
return ret;
}
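
The digest rework above drops the crypto_shash allocation in favour of the open-coded SHA-256 library interface from crypto/sha2.h. A minimal sketch of that API as used in the hunk; the buffer names are hypothetical:

#include <crypto/sha2.h>

static void example_digest(const void *hdr, size_t hdr_len,
			   const void *body, size_t body_len,
			   u8 digest[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx sctx;

	sha256_init(&sctx);
	sha256_update(&sctx, hdr, hdr_len);	/* may be called repeatedly */
	sha256_update(&sctx, body, body_len);
	sha256_final(&sctx, digest);		/* SHA256_DIGEST_SIZE bytes out */
}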
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
new file mode 100644
index 000000000000..d3b13a909913
--- /dev/null
+++ b/kernel/kexec_handover.c
@@ -0,0 +1,1271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kexec_handover.c - kexec handover metadata processing
+ * Copyright (C) 2023 Alexander Graf <graf@amazon.com>
+ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org>
+ * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com>
+ */
+
+#define pr_fmt(fmt) "KHO: " fmt
+
+#include <linux/cma.h>
+#include <linux/count_zeros.h>
+#include <linux/debugfs.h>
+#include <linux/kexec.h>
+#include <linux/kexec_handover.h>
+#include <linux/libfdt.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/notifier.h>
+#include <linux/page-isolation.h>
+
+#include <asm/early_ioremap.h>
+
+/*
+ * KHO is tightly coupled with mm init and needs access to some of mm
+ * internal APIs.
+ */
+#include "../mm/internal.h"
+#include "kexec_internal.h"
+
+#define KHO_FDT_COMPATIBLE "kho-v1"
+#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map"
+#define PROP_SUB_FDT "fdt"
+
+static bool kho_enable __ro_after_init;
+
+bool kho_is_enabled(void)
+{
+ return kho_enable;
+}
+EXPORT_SYMBOL_GPL(kho_is_enabled);
+
+static int __init kho_parse_enable(char *p)
+{
+ return kstrtobool(p, &kho_enable);
+}
+early_param("kho", kho_parse_enable);
+
+/*
+ * Keep track of memory that is to be preserved across KHO.
+ *
+ * The serializing side uses two levels of xarrays to manage chunks of per-order
+ * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
+ * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
+ * each bitmap will cover 16M of address space. Thus, for 16G of memory at most
+ * 512K of bitmap memory will be needed for order 0.
+ *
+ * This approach is fully incremental: as the serialization progresses, folios
+ * can continue to be aggregated into the tracker. The final step, immediately
+ * prior to kexec, serializes the xarray information into a linked list for the
+ * successor kernel to parse.
+ */
+
+#define PRESERVE_BITS (512 * 8)
+
+struct kho_mem_phys_bits {
+ DECLARE_BITMAP(preserve, PRESERVE_BITS);
+};
+
+struct kho_mem_phys {
+ /*
+ * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit covers one
+ * block of the tracked order.
+ */
+ struct xarray phys_bits;
+};
+
+struct kho_mem_track {
+ /* Points to kho_mem_phys, each order gets its own bitmap tree */
+ struct xarray orders;
+};
+
+struct khoser_mem_chunk;
+
+struct kho_serialization {
+ struct page *fdt;
+ struct list_head fdt_list;
+ struct dentry *sub_fdt_dir;
+ struct kho_mem_track track;
+ /* First chunk of serialized preserved memory map */
+ struct khoser_mem_chunk *preserved_mem_map;
+};
+
+static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
+{
+ void *elm, *res;
+
+ elm = xa_load(xa, index);
+ if (elm)
+ return elm;
+
+ elm = kzalloc(sz, GFP_KERNEL);
+ if (!elm)
+ return ERR_PTR(-ENOMEM);
+
+ res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
+ if (xa_is_err(res))
+ res = ERR_PTR(xa_err(res));
+
+ if (res) {
+ kfree(elm);
+ return res;
+ }
+
+ return elm;
+}
+
+static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
+ unsigned long end_pfn)
+{
+ struct kho_mem_phys_bits *bits;
+ struct kho_mem_phys *physxa;
+
+ while (pfn < end_pfn) {
+ const unsigned int order =
+ min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+ const unsigned long pfn_high = pfn >> order;
+
+ physxa = xa_load(&track->orders, order);
+ if (!physxa)
+ continue;
+
+ bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
+ if (!bits)
+ continue;
+
+ clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+
+ pfn += 1 << order;
+ }
+}
+
+static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
+ unsigned int order)
+{
+ struct kho_mem_phys_bits *bits;
+ struct kho_mem_phys *physxa;
+ const unsigned long pfn_high = pfn >> order;
+
+ might_sleep();
+
+ physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa));
+ if (IS_ERR(physxa))
+ return PTR_ERR(physxa);
+
+ bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
+ sizeof(*bits));
+ if (IS_ERR(bits))
+ return PTR_ERR(bits);
+
+ set_bit(pfn_high % PRESERVE_BITS, bits->preserve);
+
+ return 0;
+}
+
+/* almost as free_reserved_page(), just don't free the page */
+static void kho_restore_page(struct page *page, unsigned int order)
+{
+ unsigned int nr_pages = (1 << order);
+
+ /* Head page gets refcount of 1. */
+ set_page_count(page, 1);
+
+ /* For higher order folios, tail pages get a page count of zero. */
+ for (unsigned int i = 1; i < nr_pages; i++)
+ set_page_count(page + i, 0);
+
+ if (order > 0)
+ prep_compound_page(page, order);
+
+ adjust_managed_page_count(page, nr_pages);
+}
+
+/**
+ * kho_restore_folio - recreates the folio from the preserved memory.
+ * @phys: physical address of the folio.
+ *
+ * Return: pointer to the struct folio on success, NULL on failure.
+ */
+struct folio *kho_restore_folio(phys_addr_t phys)
+{
+ struct page *page = pfn_to_online_page(PHYS_PFN(phys));
+ unsigned long order;
+
+ if (!page)
+ return NULL;
+
+ order = page->private;
+ if (order > MAX_PAGE_ORDER)
+ return NULL;
+
+ kho_restore_page(page, order);
+ return page_folio(page);
+}
+EXPORT_SYMBOL_GPL(kho_restore_folio);
+
+/*
+ * Serialize and deserialize struct kho_mem_phys across kexec
+ *
+ * Record all the bitmaps in a linked list of pages for the next kernel to
+ * process. Each chunk holds bitmaps of the same order and each block of bitmaps
+ * starts at a given physical address. This allows the bitmaps to be sparse. The
+ * xarray is used to store them in a tree while building up the data structure,
+ * but the KHO successor kernel only needs to process them once in order.
+ *
+ * All of this memory is normal kmalloc() memory and is not marked for
+ * preservation. The successor kernel will remain isolated to the scratch space
+ * until it completes processing this list. Once processed, all the memory
+ * storing these ranges will be marked as free.
+ */
+
+struct khoser_mem_bitmap_ptr {
+ phys_addr_t phys_start;
+ DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *);
+};
+
+struct khoser_mem_chunk_hdr {
+ DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *);
+ unsigned int order;
+ unsigned int num_elms;
+};
+
+#define KHOSER_BITMAP_SIZE \
+ ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
+ sizeof(struct khoser_mem_bitmap_ptr))
+
+struct khoser_mem_chunk {
+ struct khoser_mem_chunk_hdr hdr;
+ struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
+};
+
+static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
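+
+/*
+ * For example, with 4 KiB pages and 8-byte KHOSER pointers the header above
+ * is 16 bytes and each bitmap entry is 16 bytes, so a single chunk carries
+ * (4096 - 16) / 16 = 255 bitmap references and fills its page exactly, which
+ * is what the static_assert above checks.
+ */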
+
+static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
+ unsigned long order)
+{
+ struct khoser_mem_chunk *chunk;
+
+ chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+ chunk->hdr.order = order;
+ if (cur_chunk)
+ KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
+ return chunk;
+}
+
+static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
+{
+ struct khoser_mem_chunk *chunk = first_chunk;
+
+ while (chunk) {
+ struct khoser_mem_chunk *tmp = chunk;
+
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ kfree(tmp);
+ }
+}
+
+static int kho_mem_serialize(struct kho_serialization *ser)
+{
+ struct khoser_mem_chunk *first_chunk = NULL;
+ struct khoser_mem_chunk *chunk = NULL;
+ struct kho_mem_phys *physxa;
+ unsigned long order;
+
+ xa_for_each(&ser->track.orders, order, physxa) {
+ struct kho_mem_phys_bits *bits;
+ unsigned long phys;
+
+ chunk = new_chunk(chunk, order);
+ if (!chunk)
+ goto err_free;
+
+ if (!first_chunk)
+ first_chunk = chunk;
+
+ xa_for_each(&physxa->phys_bits, phys, bits) {
+ struct khoser_mem_bitmap_ptr *elm;
+
+ if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
+ chunk = new_chunk(chunk, order);
+ if (!chunk)
+ goto err_free;
+ }
+
+ elm = &chunk->bitmaps[chunk->hdr.num_elms];
+ chunk->hdr.num_elms++;
+ elm->phys_start = (phys * PRESERVE_BITS)
+ << (order + PAGE_SHIFT);
+ KHOSER_STORE_PTR(elm->bitmap, bits);
+ }
+ }
+
+ ser->preserved_mem_map = first_chunk;
+
+ return 0;
+
+err_free:
+ kho_mem_ser_free(first_chunk);
+ return -ENOMEM;
+}
+
+static void __init deserialize_bitmap(unsigned int order,
+ struct khoser_mem_bitmap_ptr *elm)
+{
+ struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
+ unsigned long bit;
+
+ for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
+ int sz = 1 << (order + PAGE_SHIFT);
+ phys_addr_t phys =
+ elm->phys_start + (bit << (order + PAGE_SHIFT));
+ struct page *page = phys_to_page(phys);
+
+ memblock_reserve(phys, sz);
+ memblock_reserved_mark_noinit(phys, sz);
+ page->private = order;
+ }
+}
+
+static void __init kho_mem_deserialize(const void *fdt)
+{
+ struct khoser_mem_chunk *chunk;
+ const phys_addr_t *mem;
+ int len;
+
+ mem = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len);
+
+ if (!mem || len != sizeof(*mem)) {
+ pr_err("failed to get preserved memory bitmaps\n");
+ return;
+ }
+
+ chunk = *mem ? phys_to_virt(*mem) : NULL;
+ while (chunk) {
+ unsigned int i;
+
+ for (i = 0; i != chunk->hdr.num_elms; i++)
+ deserialize_bitmap(chunk->hdr.order,
+ &chunk->bitmaps[i]);
+ chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+ }
+}
+
+/*
+ * With KHO enabled, memory can become fragmented because KHO regions may
+ * be anywhere in physical address space. The scratch regions give us safe
+ * zones that will never contain KHO allocations. This is where we can later
+ * safely load new kexec images, and the successor kernel uses the scratch
+ * area for early allocations that happen before its page allocator is
+ * initialized.
+ */
+static struct kho_scratch *kho_scratch;
+static unsigned int kho_scratch_cnt;
+
+/*
+ * The scratch areas are scaled by default as a percentage of memory
+ * allocated from memblock. A user can override the scale with the command
+ * line parameter:
+ *
+ * kho_scratch=N%
+ *
+ * It is also possible to explicitly define sizes for the lowmem, global, and
+ * per-node scratch areas:
+ *
+ * kho_scratch=l[KMG],n[KMG],m[KMG]
+ *
+ * The explicit size definition takes precedence over the scale definition.
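+ *
+ * For example, "kho_scratch=70%" scales the scratch areas to 70% of the
+ * memory allocated from memblock, while "kho_scratch=256M,1G,512M" reserves
+ * 256M of lowmem scratch, a 1G global area, and 512M on each NUMA node.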
+ */
+static unsigned int scratch_scale __initdata = 200;
+static phys_addr_t scratch_size_global __initdata;
+static phys_addr_t scratch_size_pernode __initdata;
+static phys_addr_t scratch_size_lowmem __initdata;
+
+static int __init kho_parse_scratch_size(char *p)
+{
+ size_t len;
+ unsigned long sizes[3];
+ int i;
+
+ if (!p)
+ return -EINVAL;
+
+ len = strlen(p);
+ if (!len)
+ return -EINVAL;
+
+ /* parse nn% */
+ if (p[len - 1] == '%') {
+ /* unsigned int max is 4,294,967,295, 10 chars */
+ char s_scale[11] = {};
+ int ret = 0;
+
+ if (len > ARRAY_SIZE(s_scale))
+ return -EINVAL;
+
+ memcpy(s_scale, p, len - 1);
+ ret = kstrtouint(s_scale, 10, &scratch_scale);
+ if (!ret)
+ pr_notice("scratch scale is %d%%\n", scratch_scale);
+ return ret;
+ }
+
+ /* parse ll[KMG],mm[KMG],nn[KMG] */
+ for (i = 0; i < ARRAY_SIZE(sizes); i++) {
+ char *endp = p;
+
+ if (i > 0) {
+ if (*p != ',')
+ return -EINVAL;
+ p += 1;
+ }
+
+ sizes[i] = memparse(p, &endp);
+ if (!sizes[i] || endp == p)
+ return -EINVAL;
+ p = endp;
+ }
+
+ scratch_size_lowmem = sizes[0];
+ scratch_size_global = sizes[1];
+ scratch_size_pernode = sizes[2];
+ scratch_scale = 0;
+
+ pr_notice("scratch areas: lowmem: %lluMiB global: %lluMiB pernode: %lldMiB\n",
+ (u64)(scratch_size_lowmem >> 20),
+ (u64)(scratch_size_global >> 20),
+ (u64)(scratch_size_pernode >> 20));
+
+ return 0;
+}
+early_param("kho_scratch", kho_parse_scratch_size);
+
+static void __init scratch_size_update(void)
+{
+ phys_addr_t size;
+
+ if (!scratch_scale)
+ return;
+
+ size = memblock_reserved_kern_size(ARCH_LOW_ADDRESS_LIMIT,
+ NUMA_NO_NODE);
+ size = size * scratch_scale / 100;
+ scratch_size_lowmem = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+
+ size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+ NUMA_NO_NODE);
+ size = size * scratch_scale / 100 - scratch_size_lowmem;
+ scratch_size_global = round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+static phys_addr_t __init scratch_size_node(int nid)
+{
+ phys_addr_t size;
+
+ if (scratch_scale) {
+ size = memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE,
+ nid);
+ size = size * scratch_scale / 100;
+ } else {
+ size = scratch_size_pernode;
+ }
+
+ return round_up(size, CMA_MIN_ALIGNMENT_BYTES);
+}
+
+/**
+ * kho_reserve_scratch - Reserve scratch memory areas for kexec
+ *
+ * With KHO we can preserve arbitrary pages in the system. To ensure we still
+ * have a large contiguous region of memory when we search the physical address
+ * space for target memory, let's make sure we always have a large CMA region
+ * active. This CMA region will only be used for movable pages, which are not a
+ * problem for us during KHO because we can just move them somewhere else.
+ */
+static void __init kho_reserve_scratch(void)
+{
+ phys_addr_t addr, size;
+ int nid, i = 0;
+
+ if (!kho_enable)
+ return;
+
+ scratch_size_update();
+
+ /* FIXME: deal with node hot-plug/remove */
+ kho_scratch_cnt = num_online_nodes() + 2;
+ size = kho_scratch_cnt * sizeof(*kho_scratch);
+ kho_scratch = memblock_alloc(size, PAGE_SIZE);
+ if (!kho_scratch)
+ goto err_disable_kho;
+
+ /*
+ * reserve scratch area in low memory for lowmem allocations in the
+ * next kernel
+ */
+ size = scratch_size_lowmem;
+ addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0,
+ ARCH_LOW_ADDRESS_LIMIT);
+ if (!addr)
+ goto err_free_scratch_desc;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+
+ /* reserve large contiguous area for allocations without nid */
+ size = scratch_size_global;
+ addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES);
+ if (!addr)
+ goto err_free_scratch_areas;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+
+ for_each_online_node(nid) {
+ size = scratch_size_node(nid);
+ addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES,
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid, true);
+ if (!addr)
+ goto err_free_scratch_areas;
+
+ kho_scratch[i].addr = addr;
+ kho_scratch[i].size = size;
+ i++;
+ }
+
+ return;
+
+err_free_scratch_areas:
+ for (i--; i >= 0; i--)
+ memblock_phys_free(kho_scratch[i].addr, kho_scratch[i].size);
+err_free_scratch_desc:
+ memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
+err_disable_kho:
+ kho_enable = false;
+}
+
+struct fdt_debugfs {
+ struct list_head list;
+ struct debugfs_blob_wrapper wrapper;
+ struct dentry *file;
+};
+
+static int kho_debugfs_fdt_add(struct list_head *list, struct dentry *dir,
+ const char *name, const void *fdt)
+{
+ struct fdt_debugfs *f;
+ struct dentry *file;
+
+ f = kmalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ return -ENOMEM;
+
+ f->wrapper.data = (void *)fdt;
+ f->wrapper.size = fdt_totalsize(fdt);
+
+ file = debugfs_create_blob(name, 0400, dir, &f->wrapper);
+ if (IS_ERR(file)) {
+ kfree(f);
+ return PTR_ERR(file);
+ }
+
+ f->file = file;
+ list_add(&f->list, list);
+
+ return 0;
+}
+
+/**
+ * kho_add_subtree - record the physical address of a sub FDT in the KHO root tree.
+ * @ser: serialization control object passed by KHO notifiers.
+ * @name: name of the sub tree.
+ * @fdt: the sub tree blob.
+ *
+ * Creates a new child node named @name in the KHO root FDT and records
+ * the physical address of @fdt. The pages of @fdt must also be preserved
+ * by KHO for the new kernel to retrieve it after kexec.
+ *
+ * A debugfs blob entry is also created at
+ * ``/sys/kernel/debug/kho/out/sub_fdts/@name``.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt)
+{
+ int err = 0;
+ u64 phys = (u64)virt_to_phys(fdt);
+ void *root = page_to_virt(ser->fdt);
+
+ err |= fdt_begin_node(root, name);
+ err |= fdt_property(root, PROP_SUB_FDT, &phys, sizeof(phys));
+ err |= fdt_end_node(root);
+
+ if (err)
+ return err;
+
+ return kho_debugfs_fdt_add(&ser->fdt_list, ser->sub_fdt_dir, name, fdt);
+}
+EXPORT_SYMBOL_GPL(kho_add_subtree);
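+
+/*
+ * Example of a hypothetical caller: a subsystem typically registers a KHO
+ * notifier and, on KEXEC_KHO_FINALIZE, preserves its own FDT blob before
+ * handing it over:
+ *
+ * err = kho_preserve_folio(page_folio(virt_to_page(my_fdt)));
+ * if (!err)
+ * err = kho_add_subtree(ser, "my-subsystem", my_fdt);
+ *
+ * Here "my-subsystem" and my_fdt stand in for the caller's own node name
+ * and serialized state.
+ */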
+
+struct kho_out {
+ struct blocking_notifier_head chain_head;
+
+ struct dentry *dir;
+
+ struct mutex lock; /* protects KHO FDT finalization */
+
+ struct kho_serialization ser;
+ bool finalized;
+};
+
+static struct kho_out kho_out = {
+ .chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
+ .lock = __MUTEX_INITIALIZER(kho_out.lock),
+ .ser = {
+ .fdt_list = LIST_HEAD_INIT(kho_out.ser.fdt_list),
+ .track = {
+ .orders = XARRAY_INIT(kho_out.ser.track.orders, 0),
+ },
+ },
+ .finalized = false,
+};
+
+int register_kho_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&kho_out.chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(register_kho_notifier);
+
+int unregister_kho_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&kho_out.chain_head, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_kho_notifier);
+
+/**
+ * kho_preserve_folio - preserve a folio across kexec.
+ * @folio: folio to preserve.
+ *
+ * Instructs KHO to preserve the whole folio across kexec. The order
+ * will be preserved as well.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_folio(struct folio *folio)
+{
+ const unsigned long pfn = folio_pfn(folio);
+ const unsigned int order = folio_order(folio);
+ struct kho_mem_track *track = &kho_out.ser.track;
+
+ if (kho_out.finalized)
+ return -EBUSY;
+
+ return __kho_preserve_order(track, pfn, order);
+}
+EXPORT_SYMBOL_GPL(kho_preserve_folio);
+
+/**
+ * kho_preserve_phys - preserve a physically contiguous range across kexec.
+ * @phys: physical address of the range.
+ * @size: size of the range.
+ *
+ * Instructs KHO to preserve the memory range from @phys to @phys + @size
+ * across kexec.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_phys(phys_addr_t phys, size_t size)
+{
+ unsigned long pfn = PHYS_PFN(phys);
+ unsigned long failed_pfn = 0;
+ const unsigned long start_pfn = pfn;
+ const unsigned long end_pfn = PHYS_PFN(phys + size);
+ int err = 0;
+ struct kho_mem_track *track = &kho_out.ser.track;
+
+ if (kho_out.finalized)
+ return -EBUSY;
+
+ if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size))
+ return -EINVAL;
+
+ while (pfn < end_pfn) {
+ const unsigned int order =
+ min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
+
+ err = __kho_preserve_order(track, pfn, order);
+ if (err) {
+ failed_pfn = pfn;
+ break;
+ }
+
+ pfn += 1 << order;
+ }
+
+ if (err)
+ __kho_unpreserve(track, start_pfn, failed_pfn);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_phys);
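+
+/*
+ * Example of a hypothetical caller preserving a page-aligned buffer before
+ * finalization:
+ *
+ * err = kho_preserve_phys(virt_to_phys(buf), PAGE_ALIGN(len));
+ *
+ * Both the address and the size must be page aligned, and the call fails
+ * with -EBUSY once the KHO state has already been finalized.
+ */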
+
+/* Handling for debugfs/kho/out */
+
+static struct dentry *debugfs_root;
+
+static int kho_out_update_debugfs_fdt(void)
+{
+ int err = 0;
+ struct fdt_debugfs *ff, *tmp;
+
+ if (kho_out.finalized) {
+ err = kho_debugfs_fdt_add(&kho_out.ser.fdt_list, kho_out.dir,
+ "fdt", page_to_virt(kho_out.ser.fdt));
+ } else {
+ list_for_each_entry_safe(ff, tmp, &kho_out.ser.fdt_list, list) {
+ debugfs_remove(ff->file);
+ list_del(&ff->list);
+ kfree(ff);
+ }
+ }
+
+ return err;
+}
+
+static int kho_abort(void)
+{
+ int err;
+ unsigned long order;
+ struct kho_mem_phys *physxa;
+
+ xa_for_each(&kho_out.ser.track.orders, order, physxa) {
+ struct kho_mem_phys_bits *bits;
+ unsigned long phys;
+
+ xa_for_each(&physxa->phys_bits, phys, bits)
+ kfree(bits);
+
+ xa_destroy(&physxa->phys_bits);
+ kfree(physxa);
+ }
+ xa_destroy(&kho_out.ser.track.orders);
+
+ if (kho_out.ser.preserved_mem_map) {
+ kho_mem_ser_free(kho_out.ser.preserved_mem_map);
+ kho_out.ser.preserved_mem_map = NULL;
+ }
+
+ err = blocking_notifier_call_chain(&kho_out.chain_head, KEXEC_KHO_ABORT,
+ NULL);
+ err = notifier_to_errno(err);
+
+ if (err)
+ pr_err("Failed to abort KHO finalization: %d\n", err);
+
+ return err;
+}
+
+static int kho_finalize(void)
+{
+ int err = 0;
+ u64 *preserved_mem_map;
+ void *fdt = page_to_virt(kho_out.ser.fdt);
+
+ err |= fdt_create(fdt, PAGE_SIZE);
+ err |= fdt_finish_reservemap(fdt);
+ err |= fdt_begin_node(fdt, "");
+ err |= fdt_property_string(fdt, "compatible", KHO_FDT_COMPATIBLE);
+ /*
+ * Reserve the preserved-memory-map property in the root FDT, so
+ * that all property definitions will precede subnodes created by
+ * KHO callers.
+ */
+ err |= fdt_property_placeholder(fdt, PROP_PRESERVED_MEMORY_MAP,
+ sizeof(*preserved_mem_map),
+ (void **)&preserved_mem_map);
+ if (err)
+ goto abort;
+
+ err = kho_preserve_folio(page_folio(kho_out.ser.fdt));
+ if (err)
+ goto abort;
+
+ err = blocking_notifier_call_chain(&kho_out.chain_head,
+ KEXEC_KHO_FINALIZE, &kho_out.ser);
+ err = notifier_to_errno(err);
+ if (err)
+ goto abort;
+
+ err = kho_mem_serialize(&kho_out.ser);
+ if (err)
+ goto abort;
+
+ *preserved_mem_map = (u64)virt_to_phys(kho_out.ser.preserved_mem_map);
+
+ err |= fdt_end_node(fdt);
+ err |= fdt_finish(fdt);
+
+abort:
+ if (err) {
+ pr_err("Failed to convert KHO state tree: %d\n", err);
+ kho_abort();
+ }
+
+ return err;
+}
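+
+/*
+ * After a successful kho_finalize(), the root FDT looks roughly like this,
+ * with the subnode names chosen by kho_add_subtree() callers (illustrative):
+ *
+ * / {
+ * compatible = "kho-v1";
+ * preserved-memory-map = <phys of the first khoser_mem_chunk>;
+ * some-subsystem {
+ * fdt = <phys of that subsystem's sub FDT>;
+ * };
+ * };
+ */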
+
+static int kho_out_finalize_get(void *data, u64 *val)
+{
+ mutex_lock(&kho_out.lock);
+ *val = kho_out.finalized;
+ mutex_unlock(&kho_out.lock);
+
+ return 0;
+}
+
+static int kho_out_finalize_set(void *data, u64 _val)
+{
+ int ret = 0;
+ bool val = !!_val;
+
+ mutex_lock(&kho_out.lock);
+
+ if (val == kho_out.finalized) {
+ if (kho_out.finalized)
+ ret = -EEXIST;
+ else
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ if (val)
+ ret = kho_finalize();
+ else
+ ret = kho_abort();
+
+ if (ret)
+ goto unlock;
+
+ kho_out.finalized = val;
+ ret = kho_out_update_debugfs_fdt();
+
+unlock:
+ mutex_unlock(&kho_out.lock);
+ return ret;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_kho_out_finalize, kho_out_finalize_get,
+ kho_out_finalize_set, "%llu\n");
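+
+/*
+ * With debugfs mounted at its usual /sys/kernel/debug location, writing "1"
+ * to kho/out/finalize runs kho_finalize() and writing "0" aborts a previous
+ * finalization, for example:
+ *
+ * echo 1 > /sys/kernel/debug/kho/out/finalize
+ */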
+
+static int scratch_phys_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].addr);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_phys);
+
+static int scratch_len_show(struct seq_file *m, void *v)
+{
+ for (int i = 0; i < kho_scratch_cnt; i++)
+ seq_printf(m, "0x%llx\n", kho_scratch[i].size);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(scratch_len);
+
+static __init int kho_out_debugfs_init(void)
+{
+ struct dentry *dir, *f, *sub_fdt_dir;
+
+ dir = debugfs_create_dir("out", debugfs_root);
+ if (IS_ERR(dir))
+ return -ENOMEM;
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", dir);
+ if (IS_ERR(sub_fdt_dir))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_phys", 0400, dir, NULL,
+ &scratch_phys_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("scratch_len", 0400, dir, NULL,
+ &scratch_len_fops);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ f = debugfs_create_file("finalize", 0600, dir, NULL,
+ &fops_kho_out_finalize);
+ if (IS_ERR(f))
+ goto err_rmdir;
+
+ kho_out.dir = dir;
+ kho_out.ser.sub_fdt_dir = sub_fdt_dir;
+ return 0;
+
+err_rmdir:
+ debugfs_remove_recursive(dir);
+ return -ENOENT;
+}
+
+struct kho_in {
+ struct dentry *dir;
+ phys_addr_t fdt_phys;
+ phys_addr_t scratch_phys;
+ struct list_head fdt_list;
+};
+
+static struct kho_in kho_in = {
+ .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
+};
+
+static const void *kho_get_fdt(void)
+{
+ return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
+}
+
+/**
+ * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
+ * @name: the name of the sub FDT passed to kho_add_subtree().
+ * @phys: if found, the physical address of the sub FDT is stored in @phys.
+ *
+ * Retrieve a preserved sub FDT named @name and store its physical
+ * address in @phys.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+{
+ const void *fdt = kho_get_fdt();
+ const u64 *val;
+ int offset, len;
+
+ if (!fdt)
+ return -ENOENT;
+
+ if (!phys)
+ return -EINVAL;
+
+ offset = fdt_subnode_offset(fdt, 0, name);
+ if (offset < 0)
+ return -ENOENT;
+
+ val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
+ if (!val || len != sizeof(*val))
+ return -EINVAL;
+
+ *phys = (phys_addr_t)*val;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
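+
+/*
+ * Example of a hypothetical consumer in the successor kernel:
+ *
+ * phys_addr_t phys;
+ *
+ * if (!kho_retrieve_subtree("my-subsystem", &phys))
+ * my_fdt = phys_to_virt(phys);
+ *
+ * The name must match the one the previous kernel passed to
+ * kho_add_subtree(); "my-subsystem" and my_fdt are placeholders.
+ */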
+
+/* Handling for debugfs/kho/in */
+
+static __init int kho_in_debugfs_init(const void *fdt)
+{
+ struct dentry *sub_fdt_dir;
+ int err, child;
+
+ kho_in.dir = debugfs_create_dir("in", debugfs_root);
+ if (IS_ERR(kho_in.dir))
+ return PTR_ERR(kho_in.dir);
+
+ sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
+ if (IS_ERR(sub_fdt_dir)) {
+ err = PTR_ERR(sub_fdt_dir);
+ goto err_rmdir;
+ }
+
+ err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
+ if (err)
+ goto err_rmdir;
+
+ fdt_for_each_subnode(child, fdt, 0) {
+ int len = 0;
+ const char *name = fdt_get_name(fdt, child, NULL);
+ const u64 *fdt_phys;
+
+ fdt_phys = fdt_getprop(fdt, child, PROP_SUB_FDT, &len);
+ if (!fdt_phys)
+ continue;
+ if (len != sizeof(*fdt_phys)) {
+ pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
+ name, len);
+ continue;
+ }
+ err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
+ phys_to_virt(*fdt_phys));
+ if (err) {
+ pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
+ err);
+ continue;
+ }
+ }
+
+ return 0;
+
+err_rmdir:
+ debugfs_remove_recursive(kho_in.dir);
+ return err;
+}
+
+static __init int kho_init(void)
+{
+ int err = 0;
+ const void *fdt = kho_get_fdt();
+
+ if (!kho_enable)
+ return 0;
+
+ kho_out.ser.fdt = alloc_page(GFP_KERNEL);
+ if (!kho_out.ser.fdt) {
+ err = -ENOMEM;
+ goto err_free_scratch;
+ }
+
+ debugfs_root = debugfs_create_dir("kho", NULL);
+ if (IS_ERR(debugfs_root)) {
+ err = -ENOENT;
+ goto err_free_fdt;
+ }
+
+ err = kho_out_debugfs_init();
+ if (err)
+ goto err_free_fdt;
+
+ if (fdt) {
+ err = kho_in_debugfs_init(fdt);
+ /*
+ * Failure to create /sys/kernel/debug/kho/in does not prevent
+ * reviving state from KHO and setting up KHO for the next
+ * kexec.
+ */
+ if (err)
+ pr_err("failed exposing handover FDT in debugfs: %d\n",
+ err);
+
+ return 0;
+ }
+
+ for (int i = 0; i < kho_scratch_cnt; i++) {
+ unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
+ unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
+ unsigned long pfn;
+
+ for (pfn = base_pfn; pfn < base_pfn + count;
+ pfn += pageblock_nr_pages)
+ init_cma_reserved_pageblock(pfn_to_page(pfn));
+ }
+
+ return 0;
+
+err_free_fdt:
+ put_page(kho_out.ser.fdt);
+ kho_out.ser.fdt = NULL;
+err_free_scratch:
+ for (int i = 0; i < kho_scratch_cnt; i++) {
+ void *start = __va(kho_scratch[i].addr);
+ void *end = start + kho_scratch[i].size;
+
+ free_reserved_area(start, end, -1, "");
+ }
+ kho_enable = false;
+ return err;
+}
+late_initcall(kho_init);
+
+static void __init kho_release_scratch(void)
+{
+ phys_addr_t start, end;
+ u64 i;
+
+ memmap_init_kho_scratch_pages();
+
+ /*
+ * Mark scratch mem as CMA before we return it. That way we
+ * ensure that no kernel allocations happen on it. That means
+ * we can reuse it as scratch memory again later.
+ */
+ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
+ ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
+ ulong end_pfn = pageblock_align(PFN_UP(end));
+ ulong pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
+ set_pageblock_migratetype(pfn_to_page(pfn),
+ MIGRATE_CMA);
+ }
+}
+
+void __init kho_memory_init(void)
+{
+ struct folio *folio;
+
+ if (kho_in.scratch_phys) {
+ kho_scratch = phys_to_virt(kho_in.scratch_phys);
+ kho_release_scratch();
+
+ kho_mem_deserialize(kho_get_fdt());
+ folio = kho_restore_folio(kho_in.fdt_phys);
+ if (!folio)
+ pr_warn("failed to restore folio for KHO fdt\n");
+ } else {
+ kho_reserve_scratch();
+ }
+}
+
+void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
+ phys_addr_t scratch_phys, u64 scratch_len)
+{
+ void *fdt = NULL;
+ struct kho_scratch *scratch = NULL;
+ int err = 0;
+ unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
+
+ /* Validate the input FDT */
+ fdt = early_memremap(fdt_phys, fdt_len);
+ if (!fdt) {
+ pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
+ err = -EFAULT;
+ goto out;
+ }
+ err = fdt_check_header(fdt);
+ if (err) {
+ pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
+ fdt_phys, err);
+ err = -EINVAL;
+ goto out;
+ }
+ err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
+ if (err) {
+ pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
+ fdt_phys, KHO_FDT_COMPATIBLE, err);
+ err = -EINVAL;
+ goto out;
+ }
+
+ scratch = early_memremap(scratch_phys, scratch_len);
+ if (!scratch) {
+ pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
+ scratch_phys, scratch_len);
+ err = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * The previous kernel passed us safe contiguous blocks of memory to use
+ * for early boot purposes so that we can resize the memblock array as
+ * needed.
+ */
+ for (int i = 0; i < scratch_cnt; i++) {
+ struct kho_scratch *area = &scratch[i];
+ u64 size = area->size;
+
+ memblock_add(area->addr, size);
+ err = memblock_mark_kho_scratch(area->addr, size);
+ if (WARN_ON(err)) {
+ pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
+ &area->addr, &size, err);
+ goto out;
+ }
+ pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
+ }
+
+ memblock_reserve(scratch_phys, scratch_len);
+
+ /*
+ * Now that we have a viable region of scratch memory, let's tell
+ * the memblock allocator to only use that for any allocations.
+ * That way we ensure that nothing scribbles over in-use data while
+ * we initialize the page tables, which we will need to ingest all
+ * memory reservations from the previous kernel.
+ */
+ memblock_set_kho_scratch_only();
+
+ kho_in.fdt_phys = fdt_phys;
+ kho_in.scratch_phys = scratch_phys;
+ kho_scratch_cnt = scratch_cnt;
+ pr_info("found kexec handover data. Will skip init for some devices\n");
+
+out:
+ if (fdt)
+ early_memunmap(fdt, fdt_len);
+ if (scratch)
+ early_memunmap(scratch, scratch_len);
+ if (err)
+ pr_warn("disabling KHO revival: %d\n", err);
+}
+
+/* Helper functions for kexec_file_load */
+
+int kho_fill_kimage(struct kimage *image)
+{
+ ssize_t scratch_size;
+ int err = 0;
+ struct kexec_buf scratch;
+
+ if (!kho_enable)
+ return 0;
+
+ image->kho.fdt = page_to_phys(kho_out.ser.fdt);
+
+ scratch_size = sizeof(*kho_scratch) * kho_scratch_cnt;
+ scratch = (struct kexec_buf){
+ .image = image,
+ .buffer = kho_scratch,
+ .bufsz = scratch_size,
+ .mem = KEXEC_BUF_MEM_UNKNOWN,
+ .memsz = scratch_size,
+ .buf_align = SZ_64K, /* Makes it easier to map */
+ .buf_max = ULONG_MAX,
+ .top_down = true,
+ };
+ err = kexec_add_buffer(&scratch);
+ if (err)
+ return err;
+ image->kho.scratch = &image->segment[image->nr_segments - 1];
+
+ return 0;
+}
+
+static int kho_walk_scratch(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < kho_scratch_cnt; i++) {
+ struct resource res = {
+ .start = kho_scratch[i].addr,
+ .end = kho_scratch[i].addr + kho_scratch[i].size - 1,
+ };
+
+ /* Try to fit the kimage into our KHO scratch region */
+ ret = func(&res, kbuf);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ int ret;
+
+ if (!kho_enable || kbuf->image->type == KEXEC_TYPE_CRASH)
+ return 1;
+
+ ret = kho_walk_scratch(kbuf, func);
+
+ return ret == 1 ? 0 : -EADDRNOTAVAIL;
+}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index d35d9792402d..30a733a55a67 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -39,4 +39,20 @@ extern size_t kexec_purgatory_size;
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
#endif /* CONFIG_KEXEC_FILE */
+
+struct kexec_buf;
+
+#ifdef CONFIG_KEXEC_HANDOVER
+int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *));
+int kho_fill_kimage(struct kimage *image);
+#else
+static inline int kho_locate_mem_hole(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 1;
+}
+
+static inline int kho_fill_kimage(struct kimage *image) { return 0; }
+#endif /* CONFIG_KEXEC_HANDOVER */
#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ffe0c3d52306..ab8f9fc1f0d1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -135,8 +135,12 @@ struct kprobe_insn_cache kprobe_insn_slots = {
static int collect_garbage_slots(struct kprobe_insn_cache *c);
/**
- * __get_insn_slot() - Find a slot on an executable page for an instruction.
- * We allocate an executable page if there's no room on existing ones.
+ * __get_insn_slot - Find a slot on an executable page for an instruction.
+ * @c: Pointer to kprobe instruction cache
+ *
+ * Description: Locates an available slot on existing executable pages and
+ * allocates a new executable page if there's no room on existing ones.
+ * Return: Pointer to instruction slot on success, NULL on failure.
*/
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
diff --git a/kernel/stackleak.c b/kernel/kstack_erase.c
index bb65321761b4..e49bb88b4f0a 100644
--- a/kernel/stackleak.c
+++ b/kernel/kstack_erase.c
@@ -6,14 +6,14 @@
*
* Author: Alexander Popov <alex.popov@linux.com>
*
- * STACKLEAK reduces the information which kernel stack leak bugs can
+ * KSTACK_ERASE reduces the information which kernel stack leak bugs can
* reveal and blocks some uninitialized stack variable attacks.
*/
-#include <linux/stackleak.h>
+#include <linux/kstack_erase.h>
#include <linux/kprobes.h>
-#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+#ifdef CONFIG_KSTACK_ERASE_RUNTIME_DISABLE
#include <linux/jump_label.h>
#include <linux/string_choices.h>
#include <linux/sysctl.h>
@@ -68,7 +68,7 @@ late_initcall(stackleak_sysctls_init);
#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass)
#else
#define skip_erasing() false
-#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
+#endif /* CONFIG_KSTACK_ERASE_RUNTIME_DISABLE */
#ifndef __stackleak_poison
static __always_inline void __stackleak_poison(unsigned long erase_low,
@@ -91,7 +91,7 @@ static __always_inline void __stackleak_erase(bool on_task_stack)
erase_low = stackleak_find_top_of_poison(task_stack_low,
current->lowest_stack);
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
current->prev_lowest_stack = erase_low;
#endif
@@ -113,7 +113,7 @@ static __always_inline void __stackleak_erase(bool on_task_stack)
else
erase_high = task_stack_high;
- __stackleak_poison(erase_low, erase_high, STACKLEAK_POISON);
+ __stackleak_poison(erase_low, erase_high, KSTACK_ERASE_POISON);
/* Reset the 'lowest_stack' value for the next syscall */
current->lowest_stack = task_stack_high;
@@ -156,16 +156,16 @@ asmlinkage void noinstr stackleak_erase_off_task_stack(void)
__stackleak_erase(false);
}
-void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
+void __used __no_caller_saved_registers noinstr __sanitizer_cov_stack_depth(void)
{
unsigned long sp = current_stack_pointer;
/*
- * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than
- * STACKLEAK_SEARCH_DEPTH makes the poison search in
+ * Having CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE larger than
+ * KSTACK_ERASE_SEARCH_DEPTH makes the poison search in
* stackleak_erase() unreliable. Let's prevent that.
*/
- BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH);
+ BUILD_BUG_ON(CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE > KSTACK_ERASE_SEARCH_DEPTH);
/* 'lowest_stack' should be aligned on the register width boundary */
sp = ALIGN(sp, sizeof(unsigned long));
@@ -174,4 +174,4 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
current->lowest_stack = sp;
}
}
-EXPORT_SYMBOL(stackleak_track_stack);
+EXPORT_SYMBOL(__sanitizer_cov_stack_depth);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 77c44924cf54..85fc068f0083 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1207,7 +1207,8 @@ EXPORT_SYMBOL_GPL(kthread_queue_work);
*/
void kthread_delayed_work_timer_fn(struct timer_list *t)
{
- struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
+ struct kthread_delayed_work *dwork = timer_container_of(dwork, t,
+ timer);
struct kthread_work *work = &dwork->work;
struct kthread_worker *worker = work->worker;
unsigned long flags;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dd2bbf73718b..2d4c5bab5af8 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -297,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
dst->nr += src->nr;
}
-struct lock_class_stats lock_stats(struct lock_class *class)
+void lock_stats(struct lock_class *class, struct lock_class_stats *stats)
{
- struct lock_class_stats stats;
int cpu, i;
- memset(&stats, 0, sizeof(struct lock_class_stats));
+ memset(stats, 0, sizeof(struct lock_class_stats));
for_each_possible_cpu(cpu) {
struct lock_class_stats *pcs =
&per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
- for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
- stats.contention_point[i] += pcs->contention_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++)
+ stats->contention_point[i] += pcs->contention_point[i];
- for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
- stats.contending_point[i] += pcs->contending_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++)
+ stats->contending_point[i] += pcs->contending_point[i];
- lock_time_add(&pcs->read_waittime, &stats.read_waittime);
- lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+ lock_time_add(&pcs->read_waittime, &stats->read_waittime);
+ lock_time_add(&pcs->write_waittime, &stats->write_waittime);
- lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
- lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+ lock_time_add(&pcs->read_holdtime, &stats->read_holdtime);
+ lock_time_add(&pcs->write_holdtime, &stats->write_holdtime);
- for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
- stats.bounces[i] += pcs->bounces[i];
+ for (i = 0; i < ARRAY_SIZE(stats->bounces); i++)
+ stats->bounces[i] += pcs->bounces[i];
}
-
- return stats;
}
void clear_lock_stats(struct lock_class *class)
@@ -6619,8 +6616,16 @@ void lockdep_unregister_key(struct lock_class_key *key)
if (need_callback)
call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
- /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
- synchronize_rcu();
+ /*
+ * Wait until is_dynamic_key() has finished accessing k->hash_entry.
+ *
+ * Some operations like __qdisc_destroy() will call this in a debug
+ * kernel, and network traffic is disabled while waiting, so the
+ * latency of the wait matters in debugging cases. For now, use
+ * synchronize_rcu_expedited() to speed up the wait at the cost of
+ * system-wide IPIs. TODO: Replace RCU with hazptr for this.
+ */
+ synchronize_rcu_expedited();
}
EXPORT_SYMBOL_GPL(lockdep_unregister_key);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 82156caf77d1..0e5e6ffe91a3 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -47,29 +47,31 @@ enum {
__LOCKF(USED_READ)
};
+enum {
#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
-static const unsigned long LOCKF_ENABLED_IRQ =
+ LOCKF_ENABLED_IRQ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
-static const unsigned long LOCKF_USED_IN_IRQ =
+ LOCKF_USED_IN_IRQ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
-static const unsigned long LOCKF_ENABLED_IRQ_READ =
+ LOCKF_ENABLED_IRQ_READ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
-static const unsigned long LOCKF_USED_IN_IRQ_READ =
+ LOCKF_USED_IN_IRQ_READ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
+};
#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b52c07c4707c..1916db9aa46b 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -657,7 +657,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!test_bit(idx, lock_classes_in_use))
continue;
iter->class = class;
- iter->stats = lock_stats(class);
+ lock_stats(class, &iter->stats);
iter++;
}
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 6e6f6071cfa2..949103fd8e9b 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -53,17 +53,18 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
{
lockdep_assert_held(&lock->wait_lock);
- /* Mark the current thread as blocked on the lock: */
- task->blocked_on = waiter;
+ /* Current thread can't be already blocked (since it's executing!) */
+ DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
}
void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
+ struct mutex *blocked_on = __get_task_blocked_on(task);
+
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
DEBUG_LOCKS_WARN_ON(waiter->task != task);
- DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
- task->blocked_on = NULL;
+ DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
INIT_LIST_HEAD(&waiter->list);
waiter->task = NULL;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 555e2b3a665a..de7d6702cd96 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -29,6 +29,7 @@
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
#include <linux/osq_lock.h>
+#include <linux/hung_task.h>
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
@@ -190,9 +191,7 @@ static void
__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct list_head *list)
{
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
- WRITE_ONCE(current->blocker_mutex, lock);
-#endif
+ hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX);
debug_mutex_add_waiter(lock, waiter, current);
list_add_tail(&waiter->list, list);
@@ -208,9 +207,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
__mutex_clear_flag(lock, MUTEX_FLAGS);
debug_mutex_remove_waiter(lock, waiter, current);
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
- WRITE_ONCE(current->blocker_mutex, NULL);
-#endif
+ hung_task_clear_blocker();
}
/*
@@ -643,6 +640,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
goto err_early_kill;
}
+ __set_task_blocked_on(current, lock);
set_current_state(state);
trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
@@ -679,6 +677,12 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
first = __mutex_waiter_is_first(lock, &waiter);
+ /*
+ * As we likely have been woken up by a task
+ * that has cleared our blocked_on state, re-set
+ * it to the lock we are trying to acquire.
+ */
+ set_task_blocked_on(current, lock);
set_current_state(state);
/*
* Here we order against unlock; we must either see it change
@@ -690,8 +694,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
if (first) {
trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ /*
+ * mutex_optimistic_spin() can call schedule(), so
+ * clear blocked_on so we don't become unselectable
+ * to run.
+ */
+ clear_task_blocked_on(current, lock);
if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
break;
+ set_task_blocked_on(current, lock);
trace_contention_begin(lock, LCB_F_MUTEX);
}
@@ -699,6 +710,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
}
raw_spin_lock_irqsave(&lock->wait_lock, flags);
acquired:
+ __clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
if (ww_ctx) {
@@ -728,9 +740,11 @@ skip_wait:
return 0;
err:
+ __clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
+ WARN_ON(__get_task_blocked_on(current));
trace_contention_end(lock, ret);
raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
@@ -808,11 +822,12 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
int __sched
-mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
+_mutex_lock_killable(struct mutex *lock, unsigned int subclass,
+ struct lockdep_map *nest)
{
- return __mutex_lock(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+ return __mutex_lock(lock, TASK_KILLABLE, subclass, nest, _RET_IP_);
}
-EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+EXPORT_SYMBOL_GPL(_mutex_lock_killable);
int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
@@ -940,6 +955,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
next = waiter->task;
debug_mutex_wake_waiter(lock, waiter);
+ __clear_task_blocked_on(next, lock);
wake_q_add(&wake_q, next);
}
@@ -1062,6 +1078,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
#endif
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
/**
* mutex_trylock - try to acquire the mutex, without waiting
* @lock: the mutex to be acquired
@@ -1078,17 +1095,24 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
*/
int __sched mutex_trylock(struct mutex *lock)
{
+ MUTEX_WARN_ON(lock->magic != lock);
+ return __mutex_trylock(lock);
+}
+EXPORT_SYMBOL(mutex_trylock);
+#else
+int __sched _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock)
+{
bool locked;
MUTEX_WARN_ON(lock->magic != lock);
-
locked = __mutex_trylock(lock);
if (locked)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
return locked;
}
-EXPORT_SYMBOL(mutex_trylock);
+EXPORT_SYMBOL(_mutex_trylock_nest_lock);
+#endif
#ifndef CONFIG_DEBUG_LOCK_ALLOC
int __sched
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index cbff35b9b7ae..2e8080a9bee3 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -6,7 +6,7 @@
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
-
+#ifndef CONFIG_PREEMPT_RT
/*
* This is the control structure for tasks blocked on mutex, which resides
* on the blocked task's kernel stack:
@@ -70,3 +70,4 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
# define debug_mutex_unlock(lock) do { } while (0)
# define debug_mutex_init(lock, name, key) do { } while (0)
#endif /* !CONFIG_DEBUG_MUTEXES */
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 191e4720e546..bafd5af98eae 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -13,6 +13,24 @@
*/
int max_lock_depth = 1024;
+static const struct ctl_table rtmutex_sysctl_table[] = {
+ {
+ .procname = "max_lock_depth",
+ .data = &max_lock_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_rtmutex_sysctl(void)
+{
+ register_sysctl_init("kernel", rtmutex_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rtmutex_sysctl);
+
/*
* Debug aware fast / slowpath lock,trylock,unlock
*
@@ -544,12 +562,12 @@ int __sched mutex_lock_interruptible_nested(struct mutex *lock,
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
-int __sched mutex_lock_killable_nested(struct mutex *lock,
- unsigned int subclass)
+int __sched _mutex_lock_killable(struct mutex *lock, unsigned int subclass,
+ struct lockdep_map *nest_lock)
{
- return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, nest_lock, _RET_IP_);
}
-EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
+EXPORT_SYMBOL_GPL(_mutex_lock_killable);
void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
{
@@ -563,6 +581,21 @@ void __sched mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
}
EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
+int __sched _mutex_trylock_nest_lock(struct mutex *lock,
+ struct lockdep_map *nest_lock)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
+ return 0;
+
+ ret = __rt_mutex_trylock(&lock->rtmutex);
+ if (ret)
+ mutex_acquire_nest(&lock->dep_map, 0, 1, nest_lock, _RET_IP_);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock);
#else /* CONFIG_DEBUG_LOCK_ALLOC */
void __sched mutex_lock(struct mutex *lock)
@@ -591,22 +624,16 @@ void __sched mutex_lock_io(struct mutex *lock)
io_schedule_finish(token);
}
EXPORT_SYMBOL(mutex_lock_io);
-#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
int __sched mutex_trylock(struct mutex *lock)
{
- int ret;
-
if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
return 0;
- ret = __rt_mutex_trylock(&lock->rtmutex);
- if (ret)
- mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
-
- return ret;
+ return __rt_mutex_trylock(&lock->rtmutex);
}
EXPORT_SYMBOL(mutex_trylock);
+#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
void __sched mutex_unlock(struct mutex *lock)
{
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2ddb827e3bea..8572dba95af4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -727,8 +727,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
return ret;
}
-#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
-
static inline enum owner_state
rwsem_owner_state(struct task_struct *owner, unsigned long flags)
{
@@ -835,7 +833,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
enum owner_state owner_state;
owner_state = rwsem_spin_on_owner(sem);
- if (!(owner_state & OWNER_SPINNABLE))
+ if (owner_state == OWNER_NONSPINNABLE)
break;
/*
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index de9117c0e671..3ef032e22f7e 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -34,6 +34,7 @@
#include <linux/spinlock.h>
#include <linux/ftrace.h>
#include <trace/events/lock.h>
+#include <linux/hung_task.h>
static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
@@ -41,6 +42,41 @@ static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
+#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+ WRITE_ONCE((sem)->last_holder, (unsigned long)current);
+}
+
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+ if (READ_ONCE((sem)->last_holder) == (unsigned long)current)
+ WRITE_ONCE((sem)->last_holder, 0UL);
+}
+
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return READ_ONCE(sem->last_holder);
+}
+#else
+static inline void hung_task_sem_set_holder(struct semaphore *sem)
+{
+}
+static inline void hung_task_sem_clear_if_holder(struct semaphore *sem)
+{
+}
+unsigned long sem_last_holder(struct semaphore *sem)
+{
+ return 0UL;
+}
+#endif
+
+static inline void __sem_acquire(struct semaphore *sem)
+{
+ sem->count--;
+ hung_task_sem_set_holder(sem);
+}
+
/**
* down - acquire the semaphore
* @sem: the semaphore to be acquired
@@ -59,7 +95,7 @@ void __sched down(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
__down(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -83,7 +119,7 @@ int __sched down_interruptible(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_interruptible(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -110,7 +146,7 @@ int __sched down_killable(struct semaphore *sem)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_killable(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -140,7 +176,7 @@ int __sched down_trylock(struct semaphore *sem)
raw_spin_lock_irqsave(&sem->lock, flags);
count = sem->count - 1;
if (likely(count >= 0))
- sem->count = count;
+ __sem_acquire(sem);
raw_spin_unlock_irqrestore(&sem->lock, flags);
return (count < 0);
@@ -165,7 +201,7 @@ int __sched down_timeout(struct semaphore *sem, long timeout)
might_sleep();
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
- sem->count--;
+ __sem_acquire(sem);
else
result = __down_timeout(sem, timeout);
raw_spin_unlock_irqrestore(&sem->lock, flags);
@@ -187,6 +223,9 @@ void __sched up(struct semaphore *sem)
DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
+
+ hung_task_sem_clear_if_holder(sem);
+
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
@@ -228,8 +267,10 @@ static inline int __sched ___down_common(struct semaphore *sem, long state,
raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
raw_spin_lock_irq(&sem->lock);
- if (waiter.up)
+ if (waiter.up) {
+ hung_task_sem_set_holder(sem);
return 0;
+ }
}
timed_out:
@@ -246,10 +287,14 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
{
int ret;
+ hung_task_set_blocker(sem, BLOCKER_TYPE_SEM);
+
trace_contention_begin(sem, 0);
ret = ___down_common(sem, state, timeout);
trace_contention_end(sem, ret);
+ hung_task_clear_blocker();
+
return ret;
}
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 37f025a096c9..086fd5487ca7 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -284,6 +284,12 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
#ifndef WW_RT
debug_mutex_wake_waiter(lock, waiter);
#endif
+ /*
+ * When waking up the task to die, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ */
+ __clear_task_blocked_on(waiter->task, lock);
wake_q_add(wake_q, waiter->task);
}
@@ -331,9 +337,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
* it's wounded in __ww_mutex_check_kill() or has a
* wakeup pending to re-read the wounded state.
*/
- if (owner != current)
+ if (owner != current) {
+ /*
+ * When waking up the task to wound, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ */
+ __clear_task_blocked_on(owner, lock);
wake_q_add(wake_q, owner);
-
+ }
return true;
}
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 626cf8668a7e..51ddd8866ef3 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -58,6 +58,9 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const u32 __start___kcrctab[];
extern const u32 __start___kcrctab_gpl[];
+#define KMOD_PATH_LEN 256
+extern char modprobe_path[];
+
struct load_info {
const char *name;
/* pointer to module in temporary copy, freed at end of load_module() */
@@ -322,8 +325,11 @@ int module_enable_rodata_ro(const struct module *mod);
int module_enable_rodata_ro_after_init(const struct module *mod);
int module_enable_data_nx(const struct module *mod);
int module_enable_text_rox(const struct module *mod);
-int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod);
+int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const struct module *mod);
+void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ const char *secstrings);
#ifdef CONFIG_MODULE_SIG
int module_sig_check(struct load_info *info, int flags);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 5c6ab20240a6..81f9df8859dc 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -126,9 +126,37 @@ static void mod_update_bounds(struct module *mod)
}
/* Block module loading/unloading? */
-int modules_disabled;
+static int modules_disabled;
core_param(nomodule, modules_disabled, bint, 0);
+static const struct ctl_table module_sysctl_table[] = {
+ {
+ .procname = "modprobe",
+ .data = &modprobe_path,
+ .maxlen = KMOD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "modules_disabled",
+ .data = &modules_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init init_module_sysctl(void)
+{
+ register_sysctl_init("kernel", module_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_module_sysctl);
+
/* Waiting for a module to finish initializing? */
static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -170,6 +198,30 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
}
/*
+ * Like strncmp(), except s/-/_/g as per scripts/Makefile.lib:name-fix-token rule.
+ */
+static int mod_strncmp(const char *str_a, const char *str_b, size_t n)
+{
+ for (int i = 0; i < n; i++) {
+ char a = str_a[i];
+ char b = str_b[i];
+ int d;
+
+ if (a == '-') a = '_';
+ if (b == '-') b = '_';
+
+ d = a - b;
+ if (d)
+ return d;
+
+ if (!a)
+ break;
+ }
+
+ return 0;
+}
+
+/*
* A thread that wants to hold a reference to a module only while it
* is running can call this to safely exit.
*/
@@ -1083,6 +1135,46 @@ static char *get_modinfo(const struct load_info *info, const char *tag)
return get_next_modinfo(info, tag, NULL);
}
+/**
+ * verify_module_namespace() - does @modname have access to this symbol's @namespace
+ * @namespace: export symbol namespace
+ * @modname: module name
+ *
+ * If @namespace is prefixed with "module:" to indicate it is a module
+ * namespace, then test whether @modname matches any of the comma-separated
+ * patterns.
+ *
+ * The patterns only support tail-glob.
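+ *
+ * For example (hypothetical patterns): a namespace of
+ * "module:rtw88_*,rtw89_core" would grant access to any module whose name
+ * starts with "rtw88_" (after '-' to '_' mangling) and to the exact module
+ * "rtw89_core".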
+ */
+static bool verify_module_namespace(const char *namespace, const char *modname)
+{
+ size_t len, modlen = strlen(modname);
+ const char *prefix = "module:";
+ const char *sep;
+ bool glob;
+
+ if (!strstarts(namespace, prefix))
+ return false;
+
+ for (namespace += strlen(prefix); *namespace; namespace = sep) {
+ sep = strchrnul(namespace, ',');
+ len = sep - namespace;
+
+ glob = false;
+ if (sep[-1] == '*') {
+ len--;
+ glob = true;
+ }
+
+ if (*sep)
+ sep++;
+
+ if (mod_strncmp(namespace, modname, len) == 0 && (glob || len == modlen))
+ return true;
+ }
+
+ return false;
+}
+
static int verify_namespace_is_imported(const struct load_info *info,
const struct kernel_symbol *sym,
struct module *mod)
@@ -1092,6 +1184,10 @@ static int verify_namespace_is_imported(const struct load_info *info,
namespace = kernel_symbol_namespace(sym);
if (namespace && namespace[0]) {
+
+ if (verify_module_namespace(namespace, mod->name))
+ return 0;
+
for_each_modinfo_entry(imported_namespace, info, "import_ns") {
if (strcmp(namespace, imported_namespace) == 0)
return 0;
@@ -1505,8 +1601,14 @@ static int apply_relocations(struct module *mod, const struct load_info *info)
if (infosec >= info->hdr->e_shnum)
continue;
- /* Don't bother with non-allocated sections */
- if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
+ /*
+ * Don't bother with non-allocated sections.
+ * An exception is the percpu section, which has separate allocations
+ * for individual CPUs. We relocate the percpu section in the initial
+ * ELF template and subsequently copy it to the per-CPU destinations.
+ */
+ if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC) &&
+ (!infosec || infosec != info->index.pcpu))
continue;
if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
@@ -1562,12 +1664,11 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i
{
unsigned int m, i;
+ /*
+ * { Mask of required section header flags,
+ * Mask of excluded section header flags }
+ */
static const unsigned long masks[][2] = {
- /*
- * NOTE: all executable code must be the first section
- * in this array; otherwise modify the text_size
- * finder in the two loops below
- */
{ SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL },
{ SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL },
{ SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL },
@@ -1659,15 +1760,30 @@ static void module_license_taint_check(struct module *mod, const char *license)
}
}
-static void setup_modinfo(struct module *mod, struct load_info *info)
+static int setup_modinfo(struct module *mod, struct load_info *info)
{
const struct module_attribute *attr;
+ char *imported_namespace;
int i;
for (i = 0; (attr = modinfo_attrs[i]); i++) {
if (attr->setup)
attr->setup(mod, get_modinfo(info, attr->attr.name));
}
+
+ for_each_modinfo_entry(imported_namespace, info, "import_ns") {
+ /*
+ * 'module:' prefixed namespaces are implicit, disallow
+ * explicit imports.
+ */
+ if (strstarts(imported_namespace, "module:")) {
+ pr_err("%s: module tries to import module namespace: %s\n",
+ mod->name, imported_namespace);
+ return -EPERM;
+ }
+ }
+
+ return 0;
}
static void free_modinfo(struct module *mod)
@@ -2557,7 +2673,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->trace_bprintk_fmt_start),
&mod->num_trace_bprintk_fmt);
#endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+#ifdef CONFIG_DYNAMIC_FTRACE
/* sechdrs[0].sh_size is always zero */
mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
sizeof(*mod->ftrace_callsites),
@@ -2614,9 +2730,8 @@ static int find_module_sections(struct module *mod, struct load_info *info)
static int move_module(struct module *mod, struct load_info *info)
{
- int i;
- enum mod_mem_type t = 0;
- int ret = -ENOMEM;
+ int i, ret;
+ enum mod_mem_type t = MOD_MEM_NUM_TYPES;
bool codetag_section_found = false;
for_each_mod_mem_type(type) {
@@ -2694,7 +2809,7 @@ static int move_module(struct module *mod, struct load_info *info)
return 0;
out_err:
module_memory_restore_rox(mod);
- for (t--; t >= 0; t--)
+ while (t--)
module_memory_free(mod, t);
if (codetag_section_found)
codetag_free_module_sections(mod);
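
The while (t--) rewrite above is the usual idiom for unwinding a partially completed sequence of allocations; here is a self-contained sketch of that pattern, with hypothetical alloc_one()/free_one() helpers standing in for the per-type module memory allocation:

#include <stdlib.h>

#define NR_TYPES 4

static void *mem[NR_TYPES];

/* Hypothetical per-type helpers, for illustration only. */
static int alloc_one(int t)  { mem[t] = malloc(64); return mem[t] ? 0 : -1; }
static void free_one(int t)  { free(mem[t]); mem[t] = NULL; }

static int alloc_all(void)
{
	int t;

	for (t = 0; t < NR_TYPES; t++) {
		if (alloc_one(t))
			goto out_err;
	}
	return 0;

out_err:
	/* t is the type that failed; while (t--) frees t-1 down to 0. */
	while (t--)
		free_one(t);
	return -1;
}

int main(void)
{
	return alloc_all();
}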
@@ -2768,7 +2883,6 @@ core_param(module_blacklist, module_blacklist, charp, 0400);
static struct module *layout_and_allocate(struct load_info *info, int flags)
{
struct module *mod;
- unsigned int ndx;
int err;
/* Allow arches to frob section contents and sizes. */
@@ -2786,22 +2900,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
/*
- * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
- * layout_sections() can put it in the right place.
+ * Mark relevant sections as SHF_RO_AFTER_INIT so layout_sections() can
+ * put them in the right place.
* Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
*/
- ndx = find_sec(info, ".data..ro_after_init");
- if (ndx)
- info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
- /*
- * Mark the __jump_table section as ro_after_init as well: these data
- * structures are never modified, with the exception of entries that
- * refer to code in the __init section, which are annotated as such
- * at module load time.
- */
- ndx = find_sec(info, "__jump_table");
- if (ndx)
- info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+ module_mark_ro_after_init(info->hdr, info->sechdrs, info->secstrings);
/*
* Determine total sizes, and put offsets in sh_entsize. For now
@@ -3298,7 +3401,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_allocated = true;
- audit_log_kern_module(mod->name);
+ audit_log_kern_module(info->name);
/* Reserve our place in the list. */
err = add_unformed_module(mod);
@@ -3336,7 +3439,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_unload;
/* Set up MODINFO_ATTR fields */
- setup_modinfo(mod, info);
+ err = setup_modinfo(mod, info);
+ if (err)
+ goto free_modinfo;
/* Fix up syms, so that st_value is a pointer to location. */
err = simplify_symbols(mod, info);
@@ -3399,11 +3504,12 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto sysfs_cleanup;
}
+ if (codetag_load_module(mod))
+ goto sysfs_cleanup;
+
/* Get rid of temporary copy. */
free_copy(info, flags);
- codetag_load_module(mod);
-
/* Done! */
trace_module_load(mod);
@@ -3459,8 +3565,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
* failures once the proper module was allocated and
* before that.
*/
- if (!module_allocated)
+ if (!module_allocated) {
+ audit_log_kern_module(info->name ? info->name : "?");
mod_stat_bump_becoming(info, flags);
+ }
free_copy(info, flags);
return err;
}
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 03f4142cfbf4..8fd438529fbc 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -87,8 +87,9 @@ int module_enable_data_nx(const struct module *mod)
return 0;
}
-int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
- char *secstrings, struct module *mod)
+int module_enforce_rwx_sections(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+ const char *secstrings,
+ const struct module *mod)
{
const unsigned long shf_wx = SHF_WRITE | SHF_EXECINSTR;
int i;
@@ -106,3 +107,45 @@ int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
return 0;
}
+
+static const char *const ro_after_init[] = {
+ /*
+ * Section .data..ro_after_init holds data explicitly annotated by
+ * __ro_after_init.
+ */
+ ".data..ro_after_init",
+
+ /*
+ * Section __jump_table holds data structures that are never modified,
+ * with the exception of entries that refer to code in the __init
+ * section, which are marked as such at module load time.
+ */
+ "__jump_table",
+
+#ifdef CONFIG_HAVE_STATIC_CALL_INLINE
+ /*
+ * Section .static_call_sites holds data structures that need to be
+ * sorted and processed at module load time but are never modified
+ * afterwards.
+ */
+ ".static_call_sites",
+#endif
+};
+
+void module_mark_ro_after_init(const Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
+ const char *secstrings)
+{
+ int i, j;
+
+ for (i = 1; i < hdr->e_shnum; i++) {
+ Elf_Shdr *shdr = &sechdrs[i];
+
+ for (j = 0; j < ARRAY_SIZE(ro_after_init); j++) {
+ if (strcmp(secstrings + shdr->sh_name,
+ ro_after_init[j]) == 0) {
+ shdr->sh_flags |= SHF_RO_AFTER_INIT;
+ break;
+ }
+ }
+ }
+}
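
For context, the sections listed above are where annotated module data ends up; a minimal, hypothetical module sketch showing data that lands in .data..ro_after_init via the __ro_after_init annotation (the module name and symbols are illustrative):

#include <linux/cache.h>
#include <linux/init.h>
#include <linux/module.h>

/*
 * Written exactly once during module init; after init completes, the module
 * loader flips the backing section (marked SHF_RO_AFTER_INIT above) to
 * read-only, so any later write would fault.
 */
static unsigned long boot_config __ro_after_init;

static int __init demo_init(void)
{
	boot_config = 42;	/* last legal write */
	return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
}
module_exit(demo_exit);

MODULE_DESCRIPTION("Illustrative __ro_after_init sketch");
MODULE_LICENSE("GPL");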
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index b401ff4b02d2..c7622ff5226a 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -56,9 +56,9 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
{
const struct bin_attribute *const *bin_attr;
- for (bin_attr = sect_attrs->grp.bin_attrs_new; *bin_attr; bin_attr++)
+ for (bin_attr = sect_attrs->grp.bin_attrs; *bin_attr; bin_attr++)
kfree((*bin_attr)->attr.name);
- kfree(sect_attrs->grp.bin_attrs_new);
+ kfree(sect_attrs->grp.bin_attrs);
kfree(sect_attrs);
}
@@ -86,7 +86,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info)
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
- sect_attrs->grp.bin_attrs_new = gattr;
+ sect_attrs->grp.bin_attrs = gattr;
sattr = &sect_attrs->attrs[0];
for (i = 0; i < info->hdr->e_shnum; i++) {
@@ -101,7 +101,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info)
ret = -ENOMEM;
goto out;
}
- sattr->read_new = module_sect_read;
+ sattr->read = module_sect_read;
sattr->private = (void *)sec->sh_addr;
sattr->size = MODULE_SECT_READ_SIZE;
sattr->attr.mode = 0400;
@@ -144,7 +144,7 @@ struct module_notes_attrs {
static void free_notes_attrs(struct module_notes_attrs *notes_attrs)
{
- kfree(notes_attrs->grp.bin_attrs_new);
+ kfree(notes_attrs->grp.bin_attrs);
kfree(notes_attrs);
}
@@ -178,7 +178,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info)
}
notes_attrs->grp.name = "notes";
- notes_attrs->grp.bin_attrs_new = gattr;
+ notes_attrs->grp.bin_attrs = gattr;
nattr = &notes_attrs->attrs[0];
for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
@@ -190,7 +190,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info)
nattr->attr.mode = 0444;
nattr->size = info->sechdrs[i].sh_size;
nattr->private = (void *)info->sechdrs[i].sh_addr;
- nattr->read_new = sysfs_bin_attr_simple_read;
+ nattr->read = sysfs_bin_attr_simple_read;
*(gattr++) = nattr++;
}
++loaded;
diff --git a/kernel/panic.c b/kernel/panic.c
index a3889f38153d..43817111c979 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -84,6 +84,50 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
#ifdef CONFIG_SYSCTL
+
+/*
+ * Taint values can only be increased.
+ * This means we can safely use a temporary.
+ */
+static int proc_taint(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ int i;
+
+ /*
+ * If we are relying on panic_on_taint not producing
+ * false positives due to userspace input, bail out
+ * before setting the requested taint flags.
+ */
+ if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint))
+ return -EINVAL;
+
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this.
+ */
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++)
+ if ((1UL << i) & tmptaint)
+ add_taint(i, LOCKDEP_STILL_OK);
+ }
+
+ return err;
+}
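
The handler above backs the "tainted" sysctl registered just below; a small userspace sketch that reads /proc/sys/kernel/tainted and decodes the set bits (bit meanings follow the kernel's tainted-kernels documentation):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/tainted", "r");
	unsigned long taint = 0;
	int i;

	if (!f) {
		perror("/proc/sys/kernel/tainted");
		return 1;
	}
	if (fscanf(f, "%lu", &taint) != 1)
		taint = 0;
	fclose(f);

	/* e.g. bit 0 is TAINT_PROPRIETARY_MODULE, bit 9 is TAINT_WARN */
	for (i = 0; i < (int)(8 * sizeof(taint)); i++)
		if (taint & (1UL << i))
			printf("taint bit %d is set\n", i);
	return 0;
}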
+
static const struct ctl_table kern_panic_table[] = {
#ifdef CONFIG_SMP
{
@@ -97,12 +141,58 @@ static const struct ctl_table kern_panic_table[] = {
},
#endif
{
+ .procname = "tainted",
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = proc_taint,
+ },
+ {
+ .procname = "panic",
+ .data = &panic_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_on_oops",
+ .data = &panic_on_oops,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "panic_print",
+ .data = &panic_print,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "panic_on_warn",
+ .data = &panic_on_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "warn_limit",
.data = &warn_limit,
.maxlen = sizeof(warn_limit),
.mode = 0644,
.proc_handler = proc_douintvec,
},
+#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
+ defined(CONFIG_DEBUG_STACKOVERFLOW)
+ {
+ .procname = "panic_on_stackoverflow",
+ .data = &sysctl_panic_on_stackoverflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
};
static __init int kernel_panic_sysctls_init(void)
@@ -277,17 +367,15 @@ static void panic_other_cpus_shutdown(bool crash_kexec)
}
/**
- * panic - halt the system
- * @fmt: The text string to print
+ * vpanic - halt the system
+ * @fmt: The text string to print
+ * @args: Arguments for the format string
*
- * Display a message, then perform cleanups.
- *
- * This function never returns.
+ * Display a message, then perform cleanups. This function never returns.
*/
-void panic(const char *fmt, ...)
+void vpanic(const char *fmt, va_list args)
{
static char buf[1024];
- va_list args;
long i, i_next = 0, len;
int state = 0;
int old_cpu, this_cpu;
@@ -338,9 +426,7 @@ void panic(const char *fmt, ...)
console_verbose();
bust_spinlocks(1);
- va_start(args, fmt);
len = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
if (len && buf[len - 1] == '\n')
buf[len - 1] = '\0';
@@ -477,7 +563,17 @@ void panic(const char *fmt, ...)
mdelay(PANIC_TIMER_STEP);
}
}
+EXPORT_SYMBOL(vpanic);
+
+/* Identical to vpanic(), except it takes variadic arguments instead of a va_list */
+void panic(const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ vpanic(fmt, args);
+ va_end(args);
+}
EXPORT_SYMBOL(panic);
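
With vpanic() exported, other variadic helpers can forward their format arguments instead of pre-formatting into a buffer; a hedged sketch of a hypothetical wrapper (assuming vpanic() is declared in <linux/panic.h> as part of this series):

#include <linux/kernel.h>
#include <linux/panic.h>
#include <linux/printk.h>
#include <linux/stdarg.h>

/* Hypothetical subsystem helper, for illustration only. */
static __printf(1, 2) void demo_fatal(const char *fmt, ...)
{
	va_list args;

	pr_emerg("demo: unrecoverable error, halting\n");
	va_start(args, fmt);
	vpanic(fmt, args);
	va_end(args);	/* not reached; vpanic() does not return */
}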
#define TAINT_FLAG(taint, _c_true, _c_false, _module) \
diff --git a/kernel/pid.c b/kernel/pid.c
index 8317bcbc7cf7..c45a28c16cd2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -100,7 +100,7 @@ void put_pid(struct pid *pid)
ns = pid->numbers[pid->level].ns;
if (refcount_dec_and_test(&pid->count)) {
- WARN_ON_ONCE(pid->stashed);
+ pidfs_free_pid(pid);
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -713,6 +713,29 @@ static struct ctl_table_root pid_table_root = {
.set_ownership = pid_table_root_set_ownership,
};
+static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp_pid;
+ int r;
+ struct ctl_table tmp_table = *table;
+
+ tmp_pid = pid_vnr(cad_pid);
+ tmp_table.data = &tmp_pid;
+
+ r = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp_pid);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
static const struct ctl_table pid_table[] = {
{
.procname = "pid_max",
@@ -723,6 +746,14 @@ static const struct ctl_table pid_table[] = {
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
+#ifdef CONFIG_PROC_SYSCTL
+ {
+ .procname = "cad_pid",
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_do_cad_pid,
+ },
+#endif
};
#endif
diff --git a/kernel/power/console.c b/kernel/power/console.c
index fcdf0e14a47d..19c48aa5355d 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -16,6 +16,7 @@
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
+static bool vt_switch_done;
static DEFINE_MUTEX(vt_switch_mutex);
@@ -136,17 +137,21 @@ void pm_prepare_console(void)
if (orig_fgconsole < 0)
return;
+ vt_switch_done = true;
+
orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
return;
}
void pm_restore_console(void)
{
- if (!pm_vt_switch())
+ if (!pm_vt_switch() && !vt_switch_done)
return;
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
vt_kmsg_redirect(orig_kmsg);
}
+
+ vt_switch_done = false;
}
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index d9b7e2b38c7a..ea7995a25780 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -233,6 +233,10 @@ static int em_compute_costs(struct device *dev, struct em_perf_state *table,
unsigned long prev_cost = ULONG_MAX;
int i, ret;
+ /* This is needed only for CPUs and EAS; skip other devices. */
+ if (!_is_cpu_device(dev))
+ return 0;
+
/* Compute the cost of each performance state. */
for (i = nr_states - 1; i >= 0; i--) {
unsigned long power_res, cost;
@@ -698,10 +702,12 @@ static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
{
int ret;
- ret = em_compute_costs(dev, em_table->state, NULL, pd->nr_perf_states,
- pd->flags);
- if (ret)
- goto free_em_table;
+ if (!em_is_artificial(pd)) {
+ ret = em_compute_costs(dev, em_table->state, NULL,
+ pd->nr_perf_states, pd->flags);
+ if (ret)
+ goto free_em_table;
+ }
ret = em_dev_update_perf_domain(dev, em_table);
if (ret)
@@ -721,10 +727,24 @@ free_em_table:
* Adjustment of CPU performance values after boot, when all CPUs' capacities
* are correctly calculated.
*/
-static void em_adjust_new_capacity(struct device *dev,
+static void em_adjust_new_capacity(unsigned int cpu, struct device *dev,
struct em_perf_domain *pd)
{
+ unsigned long cpu_capacity = arch_scale_cpu_capacity(cpu);
struct em_perf_table *em_table;
+ struct em_perf_state *table;
+ unsigned long em_max_perf;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+ em_max_perf = table[pd->nr_perf_states - 1].performance;
+ rcu_read_unlock();
+
+ if (em_max_perf == cpu_capacity)
+ return;
+
+ pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu,
+ cpu_capacity, em_max_perf);
em_table = em_table_dup(pd);
if (!em_table) {
@@ -737,12 +757,27 @@ static void em_adjust_new_capacity(struct device *dev,
em_recalc_and_update(dev, pd, em_table);
}
+/**
+ * em_adjust_cpu_capacity() - Adjust the EM for a CPU after a capacity update.
+ * @cpu: Target CPU.
+ *
+ * Adjust the existing EM for @cpu after a capacity update under the assumption
+ * that the capacity has been updated in the same way for all of the CPUs in
+ * the same perf domain.
+ */
+void em_adjust_cpu_capacity(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+ struct em_perf_domain *pd;
+
+ pd = em_pd_get(dev);
+ if (pd)
+ em_adjust_new_capacity(cpu, dev, pd);
+}
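
em_adjust_cpu_capacity() gives arch or cpufreq code a way to refresh the EM after capacities change; a minimal sketch of a hypothetical caller (assuming the declaration lands in <linux/energy_model.h>):

#include <linux/cpumask.h>
#include <linux/energy_model.h>

/* Hypothetical: re-sync the EM for every CPU whose capacity was rescaled. */
static void demo_refresh_energy_model(const struct cpumask *rescaled_cpus)
{
	unsigned int cpu;

	for_each_cpu(cpu, rescaled_cpus)
		em_adjust_cpu_capacity(cpu);
}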
+
static void em_check_capacity_update(void)
{
cpumask_var_t cpu_done_mask;
- struct em_perf_state *table;
- struct em_perf_domain *pd;
- unsigned long cpu_capacity;
int cpu;
if (!zalloc_cpumask_var(&cpu_done_mask, GFP_KERNEL)) {
@@ -753,7 +788,7 @@ static void em_check_capacity_update(void)
/* Check if CPU capacity has changed, then update the EM */
for_each_possible_cpu(cpu) {
struct cpufreq_policy *policy;
- unsigned long em_max_perf;
+ struct em_perf_domain *pd;
struct device *dev;
if (cpumask_test_cpu(cpu, cpu_done_mask))
@@ -776,24 +811,7 @@ static void em_check_capacity_update(void)
cpumask_or(cpu_done_mask, cpu_done_mask,
em_span_cpus(pd));
- cpu_capacity = arch_scale_cpu_capacity(cpu);
-
- rcu_read_lock();
- table = em_perf_state_from_pd(pd);
- em_max_perf = table[pd->nr_perf_states - 1].performance;
- rcu_read_unlock();
-
- /*
- * Check if the CPU capacity has been adjusted during boot
- * and trigger the update for new performance values.
- */
- if (em_max_perf == cpu_capacity)
- continue;
-
- pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n",
- cpu, cpu_capacity, em_max_perf);
-
- em_adjust_new_capacity(dev, pd);
+ em_adjust_new_capacity(cpu, dev, pd);
}
free_cpumask_var(cpu_done_mask);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 338c9917d4ee..1f1f30cca573 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -90,6 +90,11 @@ void hibernate_release(void)
atomic_inc(&hibernate_atomic);
}
+bool hibernation_in_progress(void)
+{
+ return !atomic_read(&hibernate_atomic);
+}
+
bool hibernation_available(void)
{
return nohibernate == 0 &&
@@ -133,10 +138,15 @@ bool system_entering_hibernation(void)
EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
+static unsigned int pm_test_delay = 5;
+module_param(pm_test_delay, uint, 0644);
+MODULE_PARM_DESC(pm_test_delay,
+ "Number of seconds to wait before resuming from hibernation test");
static void hibernation_debug_sleep(void)
{
- pr_info("debug: Waiting for 5 seconds.\n");
- mdelay(5000);
+ pr_info("hibernation debug: Waiting for %d second(s).\n",
+ pm_test_delay);
+ mdelay(pm_test_delay * 1000);
}
static int hibernation_test(int level)
@@ -371,6 +381,23 @@ static int create_image(int platform_mode)
return error;
}
+static void shrink_shmem_memory(void)
+{
+ struct sysinfo info;
+ unsigned long nr_shmem_pages, nr_freed_pages;
+
+ si_meminfo(&info);
+ nr_shmem_pages = info.sharedram; /* current page count used for shmem */
+ /*
+ * The intent is to reclaim all shmem pages. Though shrink_all_memory() can
+ * only reclaim about half of them, it's enough for creating the hibernation
+ * image.
+ */
+ nr_freed_pages = shrink_all_memory(nr_shmem_pages);
+ pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n",
+ nr_shmem_pages, nr_freed_pages);
+}
+
/**
* hibernation_snapshot - Quiesce devices and create a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
@@ -412,8 +439,16 @@ int hibernation_snapshot(int platform_mode)
goto Thaw;
}
+ /*
+ * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem
+ * pages will use lots of system memory, causing hibernation image creation
+ * to fail due to insufficient free memory.
+ * This call force-flushes the shmem pages to the swap disk and reclaims
+ * the system memory so that image creation can succeed.
+ */
+ shrink_shmem_memory();
+
console_suspend_all();
- pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -549,7 +584,6 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
console_suspend_all();
- pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
@@ -561,7 +595,6 @@ int hibernation_restore(int platform_mode)
BUG_ON(!error);
}
dpm_resume_end(PMSG_RECOVER);
- pm_restore_gfp_mask();
console_resume_all();
pm_restore_console();
return error;
@@ -757,7 +790,7 @@ int hibernate(void)
* Query for the compression algorithm support if compression is enabled.
*/
if (!nocompress) {
- strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo));
+ strscpy(hib_comp_algo, hibernate_compressor);
if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
pr_err("%s compression is not available\n", hib_comp_algo);
return -EOPNOTSUPP;
@@ -1013,9 +1046,9 @@ static int software_resume(void)
*/
if (!(swsusp_header_flags & SF_NOCOMPRESS_MODE)) {
if (swsusp_header_flags & SF_COMPRESSION_ALG_LZ4)
- strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo));
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4);
else
- strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo));
+ strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO);
if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
pr_err("%s compression is not available\n", hib_comp_algo);
error = -EOPNOTSUPP;
@@ -1470,8 +1503,7 @@ static int hibernate_compressor_param_set(const char *compressor,
if (index >= 0) {
ret = param_set_copystring(comp_alg_enabled[index], kp);
if (!ret)
- strscpy(hib_comp_algo, comp_alg_enabled[index],
- sizeof(hib_comp_algo));
+ strscpy(hib_comp_algo, comp_alg_enabled[index]);
} else {
ret = index;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0b0e76324c43..3cf2d7e72567 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
#include <linux/acpi.h>
#include <linux/export.h>
+#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/pm-trace.h>
@@ -112,6 +113,14 @@ int pm_notifier_call_chain(unsigned long val)
/* If set, devices may be suspended and resumed asynchronously. */
int pm_async_enabled = 1;
+static int __init pm_async_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ pm_async_enabled = 0;
+ return 1;
+}
+__setup("pm_async=", pm_async_setup);
+
static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -557,6 +566,10 @@ static int __init pm_debugfs_init(void)
late_initcall(pm_debugfs_init);
#endif /* CONFIG_DEBUG_FS */
+bool pm_sleep_transition_in_progress(void)
+{
+ return pm_suspend_in_progress() || hibernation_in_progress();
+}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_PM_SLEEP_DEBUG
@@ -594,7 +607,7 @@ power_attr(pm_print_times);
static inline void pm_print_times_init(void)
{
- pm_print_times_enabled = !!initcall_debug;
+ pm_print_times_enabled = initcall_debug;
}
static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
@@ -613,7 +626,7 @@ bool pm_debug_messages_on __read_mostly;
bool pm_debug_messages_should_print(void)
{
- return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON;
+ return pm_debug_messages_on && pm_sleep_transition_in_progress();
}
EXPORT_SYMBOL_GPL(pm_debug_messages_should_print);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2eb81662b8fa..7ccd709af93f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -75,10 +75,14 @@ extern void enable_restore_image_protection(void);
static inline void enable_restore_image_protection(void) {}
#endif /* CONFIG_STRICT_KERNEL_RWX */
+extern bool hibernation_in_progress(void);
+
#else /* !CONFIG_HIBERNATION */
static inline void hibernate_reserved_size_init(void) {}
static inline void hibernate_image_size_init(void) {}
+
+static inline bool hibernation_in_progress(void) { return false; }
#endif /* !CONFIG_HIBERNATION */
#define power_attr(_name) \
@@ -235,11 +239,6 @@ static inline void suspend_test_finish(const char *label) {}
/* kernel/power/main.c */
extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
extern int pm_notifier_call_chain(unsigned long val);
-void pm_restrict_gfp_mask(void);
-void pm_restore_gfp_mask(void);
-#else
-static inline void pm_restrict_gfp_mask(void) {}
-static inline void pm_restore_gfp_mask(void) {}
#endif
#ifdef CONFIG_HIGHMEM
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 66ac067d9ae6..dc0dfc349f22 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -189,7 +189,7 @@ void thaw_processes(void)
oom_killer_enable();
- pr_info("Restarting tasks ... ");
+ pr_info("Restarting tasks: Starting\n");
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
@@ -208,7 +208,7 @@ void thaw_processes(void)
usermodehelper_enable();
schedule();
- pr_cont("done.\n");
+ pr_info("Restarting tasks: Done\n");
trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
@@ -217,7 +217,7 @@ void thaw_kernel_threads(void)
struct task_struct *g, *p;
pm_nosig_freezing = false;
- pr_info("Restarting kernel threads ... ");
+ pr_info("Restarting kernel threads ...\n");
thaw_workqueues();
@@ -229,5 +229,5 @@ void thaw_kernel_threads(void)
read_unlock(&tasklist_lock);
schedule();
- pr_cont("done.\n");
+ pr_info("Done restarting kernel threads.\n");
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4e6e24e8b854..501df0676a61 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1094,16 +1094,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
((unsigned long long) region->end_pfn << PAGE_SHIFT)
- 1);
- for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
- if (pfn_valid(pfn)) {
- /*
- * It is safe to ignore the result of
- * mem_bm_set_bit_check() here, since we won't
- * touch the PFNs for which the error is
- * returned anyway.
- */
- mem_bm_set_bit_check(bm, pfn);
- }
+ for_each_valid_pfn(pfn, region->start_pfn, region->end_pfn) {
+ /*
+ * It is safe to ignore the result of
+ * mem_bm_set_bit_check() here, since we won't
+ * touch the PFNs for which the error is
+ * returned anyway.
+ */
+ mem_bm_set_bit_check(bm, pfn);
+ }
}
}
@@ -1255,21 +1254,20 @@ static void mark_free_pages(struct zone *zone)
spin_lock_irqsave(&zone->lock, flags);
max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
+ for_each_valid_pfn(pfn, zone->zone_start_pfn, max_zone_pfn) {
+ page = pfn_to_page(pfn);
- if (!--page_count) {
- touch_nmi_watchdog();
- page_count = WD_PAGE_COUNT;
- }
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
- if (page_zone(page) != zone)
- continue;
+ if (page_zone(page) != zone)
+ continue;
- if (!swsusp_page_is_forbidden(page))
- swsusp_unset_page_free(page);
- }
+ if (!swsusp_page_is_forbidden(page))
+ swsusp_unset_page_free(page);
+ }
for_each_migratetype_order(order, t) {
list_for_each_entry(page,
@@ -1538,7 +1536,7 @@ static unsigned long copy_data_pages(struct memory_bitmap *copy_bm,
memory_bm_position_reset(orig_bm);
memory_bm_position_reset(copy_bm);
copy_pfn = memory_bm_next_pfn(copy_bm);
- for(;;) {
+ for (;;) {
pfn = memory_bm_next_pfn(orig_bm);
if (unlikely(pfn == BM_END_OF_MAP))
break;
@@ -2163,13 +2161,13 @@ static const char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
- if (strcmp(info->uts.sysname,init_utsname()->sysname))
+ if (strcmp(info->uts.sysname, init_utsname()->sysname))
return "system type";
- if (strcmp(info->uts.release,init_utsname()->release))
+ if (strcmp(info->uts.release, init_utsname()->release))
return "kernel release";
- if (strcmp(info->uts.version,init_utsname()->version))
+ if (strcmp(info->uts.version, init_utsname()->version))
return "version";
- if (strcmp(info->uts.machine,init_utsname()->machine))
+ if (strcmp(info->uts.machine, init_utsname()->machine))
return "machine";
return NULL;
}
@@ -2363,7 +2361,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm,
struct memory_bitmap *zero_bm)
{
unsigned long decoded_pfn;
- bool zero;
+ bool zero;
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 76b141b9aac0..b4ca17c2fecf 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -384,6 +384,7 @@ static int suspend_prepare(suspend_state_t state)
return 0;
dpm_save_failed_step(SUSPEND_FREEZE);
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_SUSPEND);
Restore:
pm_restore_console();
@@ -592,8 +593,6 @@ static int enter_state(suspend_state_t state)
ksys_sync_helper();
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
}
- if (filesystem_freeze_enabled)
- filesystems_freeze();
pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
pm_suspend_clear_flags();
@@ -606,16 +605,13 @@ static int enter_state(suspend_state_t state)
trace_suspend_resume(TPS("suspend_enter"), state, false);
pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]);
- pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
- pm_restore_gfp_mask();
Finish:
events_check_enabled = false;
pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
- filesystems_thaw();
mutex_unlock(&system_transition_mutex);
return error;
}
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 52571dcad768..4e941999a53b 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -49,6 +49,9 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
len += sysfs_emit_at(buf, len, "%s ", wl->name);
}
+ if (len > 0)
+ --len;
+
len += sysfs_emit_at(buf, len, "\n");
mutex_unlock(&wakelocks_lock);
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 48a24e7b309d..bbed41ad29cf 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -72,7 +72,6 @@ int vprintk_store(int facility, int level,
const char *fmt, va_list args);
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
void __printk_safe_enter(void);
void __printk_safe_exit(void);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d5f89f9ef29f..75a84efad40f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -921,7 +921,6 @@ ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
unsigned long args[ARRAY_SIZE(info->entry.args)];
int i;
- info->op = PTRACE_SYSCALL_INFO_ENTRY;
info->entry.nr = syscall_get_nr(child, regs);
syscall_get_arguments(child, regs, args);
for (i = 0; i < ARRAY_SIZE(args); i++)
@@ -943,10 +942,12 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
* diverge significantly enough.
*/
ptrace_get_syscall_info_entry(child, regs, info);
- info->op = PTRACE_SYSCALL_INFO_SECCOMP;
info->seccomp.ret_data = child->ptrace_message;
- /* ret_data is the last field in struct ptrace_syscall_info.seccomp */
+ /*
+ * ret_data is the last non-reserved field
+ * in struct ptrace_syscall_info.seccomp
+ */
return offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
}
@@ -954,7 +955,6 @@ static unsigned long
ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
struct ptrace_syscall_info *info)
{
- info->op = PTRACE_SYSCALL_INFO_EXIT;
info->exit.rval = syscall_get_error(child, regs);
info->exit.is_error = !!info->exit.rval;
if (!info->exit.is_error)
@@ -965,19 +965,8 @@ ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
}
static int
-ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
- void __user *datavp)
+ptrace_get_syscall_info_op(struct task_struct *child)
{
- struct pt_regs *regs = task_pt_regs(child);
- struct ptrace_syscall_info info = {
- .op = PTRACE_SYSCALL_INFO_NONE,
- .arch = syscall_get_arch(child),
- .instruction_pointer = instruction_pointer(regs),
- .stack_pointer = user_stack_pointer(regs),
- };
- unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
- unsigned long write_size;
-
/*
* This does not need lock_task_sighand() to access
* child->last_siginfo because ptrace_freeze_traced()
@@ -988,24 +977,160 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
case SIGTRAP | 0x80:
switch (child->ptrace_message) {
case PTRACE_EVENTMSG_SYSCALL_ENTRY:
- actual_size = ptrace_get_syscall_info_entry(child, regs,
- &info);
- break;
+ return PTRACE_SYSCALL_INFO_ENTRY;
case PTRACE_EVENTMSG_SYSCALL_EXIT:
- actual_size = ptrace_get_syscall_info_exit(child, regs,
- &info);
- break;
+ return PTRACE_SYSCALL_INFO_EXIT;
+ default:
+ return PTRACE_SYSCALL_INFO_NONE;
}
- break;
case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8):
- actual_size = ptrace_get_syscall_info_seccomp(child, regs,
- &info);
+ return PTRACE_SYSCALL_INFO_SECCOMP;
+ default:
+ return PTRACE_SYSCALL_INFO_NONE;
+ }
+}
+
+static int
+ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
+ void __user *datavp)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ struct ptrace_syscall_info info = {
+ .op = ptrace_get_syscall_info_op(child),
+ .arch = syscall_get_arch(child),
+ .instruction_pointer = instruction_pointer(regs),
+ .stack_pointer = user_stack_pointer(regs),
+ };
+ unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
+ unsigned long write_size;
+
+ switch (info.op) {
+ case PTRACE_SYSCALL_INFO_ENTRY:
+ actual_size = ptrace_get_syscall_info_entry(child, regs, &info);
+ break;
+ case PTRACE_SYSCALL_INFO_EXIT:
+ actual_size = ptrace_get_syscall_info_exit(child, regs, &info);
+ break;
+ case PTRACE_SYSCALL_INFO_SECCOMP:
+ actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info);
break;
}
write_size = min(actual_size, user_size);
return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size;
}
+
+static int
+ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ unsigned long args[ARRAY_SIZE(info->entry.args)];
+ int nr = info->entry.nr;
+ int i;
+
+ /*
+ * Check that the syscall number specified in info->entry.nr
+ * is either a value of type "int" or a sign-extended value
+ * of type "int".
+ */
+ if (nr != info->entry.nr)
+ return -ERANGE;
+
+ for (i = 0; i < ARRAY_SIZE(args); i++) {
+ args[i] = info->entry.args[i];
+ /*
+ * Check that the syscall argument specified in
+ * info->entry.args[i] is either a value of type
+ * "unsigned long" or a sign-extended value of type "long".
+ */
+ if (args[i] != info->entry.args[i])
+ return -ERANGE;
+ }
+
+ syscall_set_nr(child, regs, nr);
+ /*
+ * If the syscall number is set to -1, setting syscall arguments is not
+ * just pointless, it would also clobber the syscall return value on
+ * those architectures that share the same register both for the first
+ * argument of syscall and its return value.
+ */
+ if (nr != -1)
+ syscall_set_arguments(child, regs, args);
+
+ return 0;
+}
+
+static int
+ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ /*
+ * info->entry is currently a subset of info->seccomp;
+ * info->seccomp.ret_data is currently ignored.
+ */
+ return ptrace_set_syscall_info_entry(child, regs, info);
+}
+
+static int
+ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ long rval = info->exit.rval;
+
+ /*
+ * Check that the return value specified in info->exit.rval
+ * is either a value of type "long" or a sign-extended value
+ * of type "long".
+ */
+ if (rval != info->exit.rval)
+ return -ERANGE;
+
+ if (info->exit.is_error)
+ syscall_set_return_value(child, regs, rval, 0);
+ else
+ syscall_set_return_value(child, regs, 0, rval);
+
+ return 0;
+}
+
+static int
+ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size,
+ const void __user *datavp)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ struct ptrace_syscall_info info;
+
+ if (user_size < sizeof(info))
+ return -EINVAL;
+
+ /*
+ * The compatibility is tracked by info.op and info.flags: if user-space
+ * does not instruct us to use unknown extra bits from future versions
+ * of ptrace_syscall_info, we are not going to read them either.
+ */
+ if (copy_from_user(&info, datavp, sizeof(info)))
+ return -EFAULT;
+
+ /* Reserved for future use. */
+ if (info.flags || info.reserved)
+ return -EINVAL;
+
+ /* Changing the type of the system call stop is not supported yet. */
+ if (ptrace_get_syscall_info_op(child) != info.op)
+ return -EINVAL;
+
+ switch (info.op) {
+ case PTRACE_SYSCALL_INFO_ENTRY:
+ return ptrace_set_syscall_info_entry(child, regs, &info);
+ case PTRACE_SYSCALL_INFO_EXIT:
+ return ptrace_set_syscall_info_exit(child, regs, &info);
+ case PTRACE_SYSCALL_INFO_SECCOMP:
+ return ptrace_set_syscall_info_seccomp(child, regs, &info);
+ default:
+ /* Other types of system call stops are not supported yet. */
+ return -EINVAL;
+ }
+}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
int ptrace_request(struct task_struct *child, long request,
@@ -1224,6 +1349,10 @@ int ptrace_request(struct task_struct *child, long request,
case PTRACE_GET_SYSCALL_INFO:
ret = ptrace_get_syscall_info(child, addr, datavp);
break;
+
+ case PTRACE_SET_SYSCALL_INFO:
+ ret = ptrace_set_syscall_info(child, addr, datavp);
+ break;
#endif
case PTRACE_SECCOMP_GET_FILTER:
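
The new PTRACE_SET_SYSCALL_INFO request pairs with PTRACE_GET_SYSCALL_INFO; below is a hedged userspace sketch of a tracer rewriting the syscall number at a syscall-entry stop (availability of the new request constant and of struct ptrace_syscall_info depends on the libc and uapi headers in use; error handling is trimmed):

#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <linux/ptrace.h>	/* struct ptrace_syscall_info, PTRACE_SYSCALL_INFO_* */

static long deny_current_syscall(pid_t pid)
{
	struct ptrace_syscall_info info;

	memset(&info, 0, sizeof(info));
	if (ptrace(PTRACE_GET_SYSCALL_INFO, pid, sizeof(info), &info) < 0)
		return -1;
	if (info.op != PTRACE_SYSCALL_INFO_ENTRY)
		return -1;

	/* -1 skips the syscall, matching the entry handler's special case above. */
	info.entry.nr = -1;
	return ptrace(PTRACE_SET_SYSCALL_INFO, pid, sizeof(info), &info);
}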
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b5db1ac32a3d..7a893d51d02b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -55,22 +55,24 @@ MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
-/* Bits for ->extendables field, extendables param, and related definitions. */
-#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1)
-#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */
-#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2)
-#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */
-#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */
-#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */
-#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */
-#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */
-#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */
-#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */
-#define RCUTORTURE_MAX_EXTEND \
+// Bits for ->extendables field, extendables param, and related definitions.
+#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits.
+#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1)
+#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits.
+#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2)
+#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh.
+#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts.
+#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption.
+#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh().
+#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched().
+#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader.
+#define RCUTORTURE_RDR_UPDOWN 0x80 // ... down-read from task, up-read from timer.
+ // Note: Manual start, automatic end.
+#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above.
+#define RCUTORTURE_MAX_EXTEND \
(RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \
- RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED)
+ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN.
#define RCUTORTURE_RDR_ALLBITS \
(RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \
RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2)
@@ -110,6 +112,7 @@ torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing");
+torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers");
torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing");
@@ -156,6 +159,7 @@ static int nrealfakewriters;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
+static struct task_struct *updown_task;
static struct task_struct **nocb_tasks;
static struct task_struct *stats_task;
static struct task_struct *fqs_task;
@@ -378,6 +382,8 @@ struct rcu_torture_ops {
void (*readunlock)(int idx);
int (*readlock_held)(void); // lockdep.
int (*readlock_nesting)(void); // actual nesting, if available, -1 if not.
+ int (*down_read)(void);
+ void (*up_read)(int idx);
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
@@ -427,6 +433,7 @@ struct rcu_torture_ops {
int no_pi_lock;
int debug_objects;
int start_poll_irqsoff;
+ int have_up_down;
const char *name;
};
@@ -464,7 +471,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
- if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK))
+ if ((preempt_count() & HARDIRQ_MASK) || softirq_count())
longdelay_ms = 5; /* Avoid triggering BH limits. */
mdelay(longdelay_ms);
rtrsp->rt_delay_ms = longdelay_ms;
@@ -711,11 +718,6 @@ static int srcu_torture_read_lock(void)
WARN_ON_ONCE(idx & ~0x1);
ret += idx << 1;
}
- if (reader_flavor & SRCU_READ_FLAVOR_LITE) {
- idx = srcu_read_lock_lite(srcu_ctlp);
- WARN_ON_ONCE(idx & ~0x1);
- ret += idx << 2;
- }
if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
scp = srcu_read_lock_fast(srcu_ctlp);
idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
@@ -749,8 +751,6 @@ static void srcu_torture_read_unlock(int idx)
WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
if (reader_flavor & SRCU_READ_FLAVOR_FAST)
srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
- if (reader_flavor & SRCU_READ_FLAVOR_LITE)
- srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2);
if (reader_flavor & SRCU_READ_FLAVOR_NMI)
srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
@@ -762,6 +762,50 @@ static int torture_srcu_read_lock_held(void)
return srcu_read_lock_held(srcu_ctlp);
}
+static bool srcu_torture_have_up_down(void)
+{
+ int rf = reader_flavor;
+
+ if (!rf)
+ rf = SRCU_READ_FLAVOR_NORMAL;
+ return !!(cur_ops->have_up_down & rf);
+}
+
+static int srcu_torture_down_read(void)
+{
+ int idx;
+ struct srcu_ctr __percpu *scp;
+
+ WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL);
+ WARN_ON_ONCE(reader_flavor & (reader_flavor - 1));
+
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
+ idx = srcu_down_read(srcu_ctlp);
+ WARN_ON_ONCE(idx & ~0x1);
+ return idx;
+ }
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ scp = srcu_down_read_fast(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ return idx << 3;
+ }
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+static void srcu_torture_up_read(int idx)
+{
+ WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+ srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
+ else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) ||
+ !(reader_flavor & SRCU_READ_FLAVOR_ALL))
+ srcu_up_read(srcu_ctlp, idx & 0x1);
+ else
+ WARN_ON_ONCE(1);
+}
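
The up/down flavor exercised here corresponds to the semaphore-like SRCU readers, which may be entered in one context and exited in another; a minimal sketch using the existing srcu_down_read()/srcu_up_read() API (demo_srcu and the helper names are illustrative):

#include <linux/srcu.h>

DEFINE_SRCU(demo_srcu);

static int demo_idx;

/* Enter the read-side critical section in task context... */
static void demo_begin_read(void)
{
	demo_idx = srcu_down_read(&demo_srcu);
}

/* ...and leave it later from a different context, e.g. a timer handler. */
static void demo_end_read(void)
{
	srcu_up_read(&demo_srcu, demo_idx);
}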
+
static unsigned long srcu_torture_completed(void)
{
return srcu_batches_completed(srcu_ctlp);
@@ -819,6 +863,8 @@ static struct rcu_torture_ops srcu_ops = {
.readlock = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
+ .down_read = srcu_torture_down_read,
+ .up_read = srcu_torture_up_read,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
.gp_diff = rcu_seq_diff,
@@ -839,6 +885,8 @@ static struct rcu_torture_ops srcu_ops = {
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.debug_objects = 1,
+ .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST,
.name = "srcu"
};
@@ -864,6 +912,8 @@ static struct rcu_torture_ops srcud_ops = {
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
+ .down_read = srcu_torture_down_read,
+ .up_read = srcu_torture_up_read,
.get_gp_seq = srcu_torture_completed,
.gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
@@ -883,6 +933,8 @@ static struct rcu_torture_ops srcud_ops = {
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.debug_objects = 1,
+ .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU)
+ ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST,
.name = "srcud"
};
@@ -910,7 +962,8 @@ static struct rcu_torture_ops busted_srcud_ops = {
/*
* Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
- * This implementation does not necessarily work well with CPU hotplug.
+ * This implementation does not work well with CPU hotplug or
+ * with rcutorture's shuffling.
*/
static void synchronize_rcu_trivial(void)
@@ -923,6 +976,16 @@ static void synchronize_rcu_trivial(void)
}
}
+static void rcu_sync_torture_init_trivial(void)
+{
+ rcu_sync_torture_init();
+ // if (onoff_interval || shuffle_interval) {
+ if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) {
+ onoff_interval = 0;
+ shuffle_interval = 0;
+ }
+}
+
static int rcu_torture_read_lock_trivial(void)
{
preempt_disable();
@@ -936,7 +999,7 @@ static void rcu_torture_read_unlock_trivial(int idx)
static struct rcu_torture_ops trivial_ops = {
.ttype = RCU_TRIVIAL_FLAVOR,
- .init = rcu_sync_torture_init,
+ .init = rcu_sync_torture_init_trivial,
.readlock = rcu_torture_read_lock_trivial,
.read_delay = rcu_read_delay, /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock_trivial,
@@ -1722,6 +1785,7 @@ rcu_torture_writer(void *arg)
cur_ops->gp_kthread_dbg();
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
rcu_ftrace_dump(DUMP_ALL);
+ break;
}
if (stutter_waited)
sched_set_normal(current, oldnice);
@@ -1915,14 +1979,14 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
// Verify the specified RCUTORTURE_RDR* state.
#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count()
-static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq)
+static void rcutorture_one_extend_check(char *s, int curstate, int new, int old)
{
int mask;
- if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE))
+ if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi())
return;
- WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS);
+ WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS);
WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS);
// If CONFIG_PREEMPT_COUNT=n, further checks are unreliable.
@@ -1930,21 +1994,21 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
return;
WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
- !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
+ !softirq_count(), ROEC_ARGS);
WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
!(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
WARN_ONCE(cur_ops->readlock_nesting &&
(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
cur_ops->readlock_nesting() == 0, ROEC_ARGS);
- // Timer handlers have all sorts of stuff disabled, so ignore
+ // Interrupt handlers have all sorts of stuff disabled, so ignore
// unintended disabling.
- if (insoftirq)
+ if (in_serving_softirq() || in_hardirq())
return;
WARN_ONCE(cur_ops->extendables &&
!(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
- (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
+ softirq_count(), ROEC_ARGS);
/*
* non-preemptible RCU in a preemptible kernel uses preempt_disable()
@@ -1965,6 +2029,9 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count())
+ mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
+
WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) &&
cur_ops->readlock_nesting() > 0, ROEC_ARGS);
}
@@ -1978,8 +2045,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
* beginning or end of the critical section and if there was actually a
* change, do a ->read_delay().
*/
-static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
- struct torture_random_state *trsp,
+static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp,
struct rt_read_seg *rtrsp)
{
bool first;
@@ -1993,8 +2059,8 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
first = idxold1 == 0;
WARN_ON_ONCE(idxold2 < 0);
- WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS);
- rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq);
+ WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN));
+ rcutorture_one_extend_check("before change", idxold1, statesnew, statesold);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
@@ -2014,8 +2080,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2;
// Complain unless both the old and the new protection is in place.
- rcutorture_one_extend_check("during change",
- idxold1 | statesnew, statesnew, statesold, insoftirq);
+ rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold);
// Sample CPU under both sets of protections to reduce confusion.
if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) {
@@ -2069,6 +2134,11 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
if (lockit)
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
}
+ if (statesold & RCUTORTURE_RDR_UPDOWN) {
+ cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1);
+ WARN_ON_ONCE(idxnew1 != -1);
+ idxold1 = 0;
+ }
/* Delay if neither beginning nor end and there was a change. */
if ((statesnew || statesold) && *readstate && newstate)
@@ -2085,7 +2155,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
WARN_ON_ONCE(*readstate < 0);
if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS))
pr_info("Unexpected readstate value of %#x\n", *readstate);
- rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq);
+ rcutorture_one_extend_check("after change", *readstate, statesnew, statesold);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -2152,8 +2222,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
* critical section.
*/
static struct rt_read_seg *
-rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp,
- struct rt_read_seg *rtrsp)
+rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp)
{
int i;
int j;
@@ -2167,7 +2236,8 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
- rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]);
+ WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN);
+ rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]);
}
return &rtrsp[j];
}
@@ -2209,10 +2279,11 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp
rtorsp->started = cur_ops->get_gp_seq();
rtorsp->ts = rcu_trace_clock_local();
rtorsp->p = rcu_dereference_check(rcu_torture_current,
- !cur_ops->readlock_held || cur_ops->readlock_held());
+ !cur_ops->readlock_held || cur_ops->readlock_held() ||
+ (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN));
if (rtorsp->p == NULL) {
/* Wait for rcu_torture_writer to get underway */
- rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp);
+ rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp);
return false;
}
if (rtorsp->p->rtort_mbtest == 0)
@@ -2226,7 +2297,7 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp
* critical sections and check for errors.
*/
static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp,
- struct torture_random_state *trsp, long myid)
+ struct torture_random_state *trsp)
{
int i;
unsigned long completed;
@@ -2273,7 +2344,7 @@ static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp,
}
if (cur_ops->reader_blocked)
preempted = cur_ops->reader_blocked();
- rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp);
+ rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp);
WARN_ON_ONCE(rtorsp->readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
@@ -2302,13 +2373,14 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
WARN_ON_ONCE(!rcu_is_watching());
init_rcu_torture_one_read_state(&rtors, trsp);
newstate = rcutorture_extend_mask(rtors.readstate, trsp);
- rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++);
+ WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN);
+ rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++);
if (!rcu_torture_one_read_start(&rtors, trsp, myid)) {
- rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp);
+ rcutorture_one_extend(&rtors.readstate, 0, trsp, rtors.rtrsp);
return false;
}
- rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, myid < 0, trsp, rtors.rtrsp);
- rcu_torture_one_read_end(&rtors, trsp, myid);
+ rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp);
+ rcu_torture_one_read_end(&rtors, trsp);
return true;
}
@@ -2371,13 +2443,159 @@ rcu_torture_reader(void *arg)
} while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable) {
timer_delete_sync(&t);
- destroy_timer_on_stack(&t);
+ timer_destroy_on_stack(&t);
}
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
torture_kthread_stopping("rcu_torture_reader");
return 0;
}
+struct rcu_torture_one_read_state_updown {
+ struct hrtimer rtorsu_hrt;
+ bool rtorsu_inuse;
+ ktime_t rtorsu_kt;
+ int rtorsu_cpu;
+ unsigned long rtorsu_j;
+ unsigned long rtorsu_ndowns;
+ unsigned long rtorsu_nups;
+ unsigned long rtorsu_nmigrates;
+ struct torture_random_state rtorsu_trs;
+ struct rcu_torture_one_read_state rtorsu_rtors;
+};
+
+static struct rcu_torture_one_read_state_updown *updownreaders;
+static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand);
+static int rcu_torture_updown(void *arg);
+
+static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp)
+{
+ int cpu = raw_smp_processor_id();
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt);
+ rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ WRITE_ONCE(rtorsup->rtorsu_nmigrates,
+ rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu));
+ smp_store_release(&rtorsup->rtorsu_inuse, false);
+ return HRTIMER_NORESTART;
+}
+
+static int rcu_torture_updown_init(void)
+{
+ int i;
+ struct torture_random_state *rand = &rcu_torture_updown_rand;
+ int ret;
+
+ if (n_up_down < 0)
+ return 0;
+ if (!srcu_torture_have_up_down()) {
+ VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives");
+ return 0;
+ }
+ updownreaders = kcalloc(n_up_down, sizeof(*updownreaders), GFP_KERNEL);
+ if (!updownreaders) {
+ VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests");
+ return -ENOMEM;
+ }
+ for (i = 0; i < n_up_down; i++) {
+ init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand);
+ hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
+ torture_random_init(&updownreaders[i].rtorsu_trs);
+ init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors,
+ &updownreaders[i].rtorsu_trs);
+ }
+ ret = torture_create_kthread(rcu_torture_updown, rand, updown_task);
+ if (ret) {
+ kfree(updownreaders);
+ updownreaders = NULL;
+ }
+ return ret;
+}
+
+static void rcu_torture_updown_cleanup(void)
+{
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) {
+ if (!smp_load_acquire(&rtorsup->rtorsu_inuse))
+ continue;
+ if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) {
+ rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ smp_store_release(&rtorsup->rtorsu_inuse, false);
+ }
+
+ }
+ kfree(updownreaders);
+ updownreaders = NULL;
+}
+
+// Do one reader for rcu_torture_updown().
+static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup)
+{
+ int idx;
+ int rawidx;
+ ktime_t t;
+
+ init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs);
+ rawidx = cur_ops->down_read();
+ WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1);
+ idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1;
+ rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN;
+ rtorsup->rtorsu_rtors.rtrsp++;
+ rtorsup->rtorsu_cpu = raw_smp_processor_id();
+ if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) {
+ WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders);
+ WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1);
+ schedule_timeout_idle(HZ);
+ return;
+ }
+ smp_store_release(&rtorsup->rtorsu_inuse, true);
+ t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million.
+ if (t < 10 * 1000)
+ t = 200 * 1000 * 1000;
+ hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
+ smp_mb(); // Sample jiffies after posting hrtimer.
+ rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler.
+ rtorsup->rtorsu_kt = t;
+}
+
+/*
+ * RCU torture up/down reader kthread, starting RCU readers in kthread
+ * context and ending them in hrtimer handlers. Otherwise similar to
+ * rcu_torture_reader().
+ */
+static int
+rcu_torture_updown(void *arg)
+{
+ unsigned long j;
+ struct rcu_torture_one_read_state_updown *rtorsup;
+
+ VERBOSE_TOROUT_STRING("rcu_torture_updown task started");
+ do {
+ for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) {
+ if (torture_must_stop())
+ break;
+ j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse.
+ if (smp_load_acquire(&rtorsup->rtorsu_inuse)) {
+ WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10),
+ "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j);
+ continue;
+ }
+ rcu_torture_updown_one(rtorsup);
+ }
+ torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand);
+ stutter_wait("rcu_torture_updown");
+ } while (!torture_must_stop());
+ rcu_torture_updown_cleanup();
+ torture_kthread_stopping("rcu_torture_updown");
+ return 0;
+}
+
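The new up/down readers split a single RCU reader across contexts: rcu_torture_updown_one() begins the critical section and arms an hrtimer in kthread context, and the hrtimer handler ends it, with rtorsu_inuse acting as the ownership handshake. A minimal sketch of that pattern under the same assumptions (SRCU's hand-off-capable srcu_down_read()/srcu_up_read(); all other names hypothetical):

/*
 * Illustrative sketch only: hypothetical names, not part of the patch.
 * A reader is opened in task context and closed from a hard hrtimer
 * handler, with ->inuse as the acquire/release ownership handshake,
 * mirroring the rtorsu_inuse usage above.
 */
#include <linux/hrtimer.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_srcu);

struct split_reader {
	struct hrtimer	hrt;
	bool		inuse;	/* true while the hrtimer owns the reader */
	int		idx;	/* cookie from srcu_down_read() */
};

static enum hrtimer_restart split_reader_end(struct hrtimer *hrtp)
{
	struct split_reader *sr = container_of(hrtp, struct split_reader, hrt);

	/* End the reader begun in task context, then release ownership. */
	srcu_up_read(&my_srcu, sr->idx);
	smp_store_release(&sr->inuse, false);
	return HRTIMER_NORESTART;
}

static void split_reader_init(struct split_reader *sr)
{
	hrtimer_setup(&sr->hrt, split_reader_end, CLOCK_MONOTONIC,
		      HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
}

static void split_reader_begin(struct split_reader *sr, ktime_t delay)
{
	if (smp_load_acquire(&sr->inuse))
		return;			/* previous reader still outstanding */
	sr->idx = srcu_down_read(&my_srcu);
	smp_store_release(&sr->inuse, true);
	hrtimer_start(&sr->hrt, delay, HRTIMER_MODE_REL | HRTIMER_MODE_HARD);
}
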
/*
* Randomly Toggle CPUs' callback-offload state. This uses hrtimers to
* increase race probabilities and fuzzes the interval between toggling.
@@ -2441,6 +2659,10 @@ rcu_torture_stats_print(void)
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long n_gpwraps = 0;
+ unsigned long ndowns = 0;
+ unsigned long nunexpired = 0;
+ unsigned long nmigrates = 0;
+ unsigned long nups = 0;
struct rcu_torture *rtcp;
static unsigned long rtcv_snap = ULONG_MAX;
static bool splatted;
@@ -2454,10 +2676,18 @@ rcu_torture_stats_print(void)
if (cur_ops->get_gpwrap_count)
n_gpwraps += cur_ops->get_gpwrap_count(cpu);
}
+ if (updownreaders) {
+ for (i = 0; i < n_up_down; i++) {
+ ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns);
+ nups += READ_ONCE(updownreaders[i].rtorsu_nups);
+ nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse);
+ nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates);
+ }
+ }
for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) {
if (pipesummary[i] != 0)
break;
- }
+ } // The value of variable "i" is used later, so don't clobber it!
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
rtcp = rcu_access_pointer(rcu_torture_current);
@@ -2478,6 +2708,8 @@ rcu_torture_stats_print(void)
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
atomic_long_read(&n_rcu_torture_timers));
+ if (updownreaders)
+ pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates);
torture_onoff_stats();
pr_cont("barrier: %ld/%ld:%ld ",
data_race(n_barrier_successes),
@@ -2632,7 +2864,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"reader_flavor=%x "
"nocbs_nthreads=%d nocbs_toggle=%d "
"test_nmis=%d "
- "preempt_duration=%d preempt_interval=%d\n",
+ "preempt_duration=%d preempt_interval=%d n_up_down=%d\n",
torture_type, tag, nrealreaders, nrealfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -2646,7 +2878,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
reader_flavor,
nocbs_nthreads, nocbs_toggle,
test_nmis,
- preempt_duration, preempt_interval);
+ preempt_duration, preempt_interval, n_up_down);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -3749,6 +3981,10 @@ rcu_torture_cleanup(void)
nocb_tasks = NULL;
}
+ if (updown_task) {
+ torture_stop_kthread(rcu_torture_updown, updown_task);
+ updown_task = NULL;
+ }
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
torture_stop_kthread(rcu_torture_reader,
@@ -4245,11 +4481,6 @@ rcu_torture_init(void)
/* Start up the kthreads. */
rcu_torture_write_types();
- firsterr = torture_create_kthread(rcu_torture_writer, NULL,
- writer_task);
- if (torture_init_error(firsterr))
- goto unwind;
-
if (nrealfakewriters > 0) {
fakewriter_tasks = kcalloc(nrealfakewriters,
sizeof(fakewriter_tasks[0]),
@@ -4282,6 +4513,15 @@ rcu_torture_init(void)
if (torture_init_error(firsterr))
goto unwind;
}
+
+ firsterr = torture_create_kthread(rcu_torture_writer, NULL,
+ writer_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+
+ firsterr = rcu_torture_updown_init();
+ if (torture_init_error(firsterr))
+ goto unwind;
nrealnocbers = nocbs_nthreads;
if (WARN_ON(nrealnocbers < 0))
nrealnocbers = 1;
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index f11a7c2af778..df646e0694a8 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -85,7 +85,7 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
// Number of typesafe_lookup structures, that is, the degree of concurrency.
torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");
// Number of loops per experiment, all readers execute operations concurrently.
-torture_param(long, loops, 10000, "Number of loops per experiment.");
+torture_param(int, loops, 10000, "Number of loops per experiment.");
// Number of readers, with -1 defaulting to about 75% of the CPUs.
torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
// Number of runs.
@@ -246,36 +246,6 @@ static const struct ref_scale_ops srcu_fast_ops = {
.name = "srcu-fast"
};
-static void srcu_lite_ref_scale_read_section(const int nloops)
-{
- int i;
- int idx;
-
- for (i = nloops; i >= 0; i--) {
- idx = srcu_read_lock_lite(srcu_ctlp);
- srcu_read_unlock_lite(srcu_ctlp, idx);
- }
-}
-
-static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
-{
- int i;
- int idx;
-
- for (i = nloops; i >= 0; i--) {
- idx = srcu_read_lock_lite(srcu_ctlp);
- un_delay(udl, ndl);
- srcu_read_unlock_lite(srcu_ctlp, idx);
- }
-}
-
-static const struct ref_scale_ops srcu_lite_ops = {
- .init = rcu_sync_scale_init,
- .readsection = srcu_lite_ref_scale_read_section,
- .delaysection = srcu_lite_ref_scale_delay_section,
- .name = "srcu-lite"
-};
-
#ifdef CONFIG_TASKS_RCU
// Definitions for RCU Tasks ref scale testing: Empty read markers.
@@ -1140,7 +1110,7 @@ static void
ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag)
{
pr_alert("%s" SCALE_FLAG
- "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
+ "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay);
}
@@ -1193,7 +1163,7 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static const struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
+ &rcu_ops, &srcu_ops, &srcu_fast_ops, RCU_TRACE_OPS RCU_TASKS_OPS
&refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops,
&acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
@@ -1238,12 +1208,16 @@ ref_scale_init(void)
// Reader tasks (default to ~75% of online CPUs).
if (nreaders < 0)
nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
- if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops))
+ if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops))
loops = 1;
if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders))
nreaders = 1;
if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns))
nruns = 1;
+ if (WARN_ONCE(loops > INT_MAX / nreaders,
+ "%s: nreaders * loops will overflow, adjusted loops to %d",
+ __func__, INT_MAX / nreaders))
+ loops = INT_MAX / nreaders;
reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (!reader_tasks) {
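
The clamp added above tests loops > INT_MAX / nreaders instead of computing nreaders * loops, so the check itself cannot overflow. A small standalone sketch of the same idiom (hypothetical names, not from the patch):

/* Standalone sketch of the overflow-safe clamp; names are made up. */
#include <limits.h>
#include <stdio.h>

/* Clamp loops so that nreaders * loops still fits in an int. */
static int clamp_total_ops(int nreaders, int loops)
{
	if (nreaders > 0 && loops > INT_MAX / nreaders)
		return INT_MAX / nreaders;	/* divide first: no overflow */
	return loops;
}

int main(void)
{
	/* 100 readers * 30000000 loops would overflow a 32-bit int. */
	printf("%d\n", clamp_total_ops(100, 30000000));	/* prints 21474836 */
	return 0;
}
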
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 48047260697e..c5e8ebc493d5 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -502,6 +502,8 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
*/
if (!did_gp)
smp_mb(); /* A */
+ else if (srcu_gp_is_expedited(ssp))
+ synchronize_rcu_expedited(); /* X */
else
synchronize_rcu(); /* X */
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index c0cc7ae41106..f92443561d36 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -316,7 +316,8 @@ static void call_rcu_tasks_generic_timer(struct timer_list *tlp)
unsigned long flags;
bool needwake = false;
struct rcu_tasks *rtp;
- struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer);
+ struct rcu_tasks_percpu *rtpcp = timer_container_of(rtpcp, tlp,
+ lazy_timer);
rtp = rtpcp->rtpp;
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
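
The from_timer() to timer_container_of() conversions here and in tree_nocb.h are mechanical renames; both recover the enclosing structure from the embedded timer_list. A hedged sketch of the pattern with made-up names:

/* Illustrative sketch only; struct and function names are hypothetical. */
#include <linux/timer.h>
#include <linux/jiffies.h>

struct my_ctx {
	struct timer_list	my_timer;
	unsigned long		fired;
};

static void my_timer_fn(struct timer_list *t)
{
	/* Recover the containing structure from the timer_list pointer. */
	struct my_ctx *ctx = timer_container_of(ctx, t, my_timer);

	ctx->fired++;
}

static void my_ctx_init(struct my_ctx *ctx)
{
	timer_setup(&ctx->my_timer, my_timer_fn, 0);
	mod_timer(&ctx->my_timer, jiffies + HZ);
}
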
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e8a4b720d7d2..174ee243b349 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -160,7 +160,6 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
-static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
@@ -377,7 +376,7 @@ EXPORT_SYMBOL_GPL(rcu_momentary_eqs);
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
- long nesting;
+ long nmi_nesting = ct_nmi_nesting();
/*
* Usually called from the tick; but also used from smp_function_call()
@@ -389,21 +388,28 @@ static int rcu_is_cpu_rrupt_from_idle(void)
/* Check for counter underflows */
RCU_LOCKDEP_WARN(ct_nesting() < 0,
"RCU nesting counter underflow!");
- RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0,
- "RCU nmi_nesting counter underflow/zero!");
- /* Are we at first interrupt nesting level? */
- nesting = ct_nmi_nesting();
- if (nesting > 1)
+ /* Non-idle interrupt or nested idle interrupt */
+ if (nmi_nesting > 1)
return false;
/*
- * If we're not in an interrupt, we must be in the idle task!
+	 * Non-nested idle interrupt (interrupting a section where RCU
+ * wasn't watching).
*/
- WARN_ON_ONCE(!nesting && !is_idle_task(current));
+ if (nmi_nesting == 1)
+ return true;
+
+ /* Not in an interrupt */
+ if (!nmi_nesting) {
+ RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current),
+ "RCU nmi_nesting counter not in idle task!");
+ return !rcu_is_watching_curr_cpu();
+ }
- /* Does CPU appear to be idle from an RCU standpoint? */
- return ct_nesting() == 0;
+ RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!");
+
+ return false;
}
#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
@@ -1625,8 +1631,10 @@ static void rcu_sr_put_wait_head(struct llist_node *node)
atomic_set_release(&sr_wn->inuse, 0);
}
-/* Disabled by default. */
-static int rcu_normal_wake_from_gp;
+/* Enable rcu_normal_wake_from_gp automatically on small systems. */
+#define WAKE_FROM_GP_CPU_THRESHOLD 16
+
+static int rcu_normal_wake_from_gp = -1;
module_param(rcu_normal_wake_from_gp, int, 0644);
static struct workqueue_struct *sync_wq;
@@ -1829,6 +1837,18 @@ static noinline_for_stack bool rcu_gp_init(void)
start_new_poll = rcu_sr_normal_gp_init();
/* Record GP times before starting GP, hence rcu_seq_start(). */
old_gp_seq = rcu_state.gp_seq;
+ /*
+ * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug
+ * scan below. Otherwise we risk a race where a newly onlining CPU could
+ * be missed by the current grace period, potentially leading to
+ * use-after-free errors. For a detailed explanation of this race, see
+ * Documentation/RCU/Design/Requirements/Requirements.rst in the
+ * "Hotplug CPU" section.
+ *
+ * Also note that the root rnp's gp_seq is kept separate from, and lags,
+ * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on
+ * Single-node systems for more details (in Data-Structures.rst).
+ */
rcu_seq_start(&rcu_state.gp_seq);
/* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */
WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
@@ -1865,6 +1885,10 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
local_irq_disable();
+ /*
+ * Serialize with CPU offline. See Requirements.rst > Hotplug CPU >
+ * Concurrent Quiescent State Reporting for Offline CPUs.
+ */
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1939,7 +1963,12 @@ static noinline_for_stack bool rcu_gp_init(void)
trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq,
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
- /* Quiescent states for tasks on any now-offline CPUs. */
+ /*
+ * Quiescent states for tasks on any now-offline CPUs. Since we
+ * released the ofl and rnp lock before this loop, CPUs might
+ * have gone offline and we have to report QS on their behalf.
+ * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting.
+ */
mask = rnp->qsmask & ~rnp->qsmaskinitnext;
rnp->rcu_gp_init_mask = mask;
if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
@@ -3072,6 +3101,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
+ /* Avoid NULL dereference if callback is NULL. */
+ if (WARN_ON_ONCE(!func))
+ return;
+
if (debug_rcu_head_queue(head)) {
/*
* Probable double call_rcu(), so leak the callback.
@@ -3239,7 +3272,7 @@ static void synchronize_rcu_normal(void)
trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
- if (!READ_ONCE(rcu_normal_wake_from_gp)) {
+ if (READ_ONCE(rcu_normal_wake_from_gp) < 1) {
wait_rcu_gp(call_rcu_hurry);
goto trace_complete_out;
}
@@ -4264,7 +4297,6 @@ int rcutree_online_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return 0; /* Too early in boot for scheduler work. */
- sync_sched_exp_online_cleanup(cpu);
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -4354,6 +4386,12 @@ void rcutree_report_cpu_dead(void)
* may introduce a new READ-side while it is actually off the QS masks.
*/
lockdep_assert_irqs_disabled();
+ /*
+ * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution.
+ * The requested QS must have been reported on the last context switch
+ * from stop machine to idle.
+ */
+ WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
// Do any dangling deferred wakeups.
do_nocb_deferred_wakeup(rdp);
@@ -4361,6 +4399,13 @@ void rcutree_report_cpu_dead(void)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
+
+ /*
+ * Hold the ofl_lock and rnp lock to avoid races between CPU going
+ * offline and doing a QS report (as below), versus rcu_gp_init().
+ * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section
+ * for more details.
+ */
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4371,6 +4416,7 @@ void rcutree_report_cpu_dead(void)
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
}
+ /* Clear from ->qsmaskinitnext to mark offline. */
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
arch_spin_unlock(&rcu_state.ofl_lock);
@@ -4843,6 +4889,12 @@ void __init rcu_init(void)
sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
WARN_ON(!sync_wq);
+ /* Respect if explicitly disabled via a boot parameter. */
+ if (rcu_normal_wake_from_gp < 0) {
+ if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD)
+ rcu_normal_wake_from_gp = 1;
+ }
+
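Combined with the "< 1" test added to synchronize_rcu_normal() earlier in this patch, rcu_normal_wake_from_gp now acts as a tri-state. A hypothetical helper (not part of the patch) spelling out the assumed semantics:

/*
 * Hypothetical helper, not part of the patch, summarizing the semantics
 * assumed by the initialization above and the "< 1" check added to
 * synchronize_rcu_normal():
 *   -1: auto -- rcu_init() flips it to 1 when the system has at most
 *       WAKE_FROM_GP_CPU_THRESHOLD possible CPUs, otherwise it is left
 *       negative and treated as disabled.
 *    0: explicitly disabled; synchronize_rcu_normal() uses wait_rcu_gp().
 *  >=1: enabled; sleepers are woken directly at grace-period completion.
 */
static inline bool rcu_wake_from_gp_enabled(void)
{
	return READ_ONCE(rcu_normal_wake_from_gp) >= 1;
}
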
/* Fill in default value for rcutree.qovld boot parameter. */
/* -After- the rcu_node ->lock fields are initialized! */
if (qovld < 0)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 3830c19cf2f6..de6ca13a7b5f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -174,6 +174,17 @@ struct rcu_snap_record {
unsigned long jiffies; /* Track jiffies value */
};
+/*
+ * An IRQ work (deferred_qs_iw) is used by RCU to get the scheduler's attention
+ * in order to report quiescent states at the soonest possible time.
+ * The request can be in one of the following states:
+ * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled.
+ * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it
+ * ran and we still haven't reported a quiescent state.
+ */
+#define DEFER_QS_IDLE 0
+#define DEFER_QS_PENDING 1
+
/* Per-CPU data for read-copy update. */
struct rcu_data {
/* 1) quiescent-state and grace-period handling : */
@@ -192,7 +203,7 @@ struct rcu_data {
/* during and after the last grace */
/* period it is aware of. */
struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
- bool defer_qs_iw_pending; /* Scheduler attention pending? */
+ int defer_qs_iw_pending; /* Scheduler attention pending? */
struct work_struct strict_work; /* Schedule readers for strict GPs. */
/* 2) batch handling */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index c36c7d5575ca..6058a734090c 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -141,6 +141,13 @@ static void __maybe_unused sync_exp_reset_tree(void)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
WRITE_ONCE(rnp->expmask, rnp->expmaskinit);
+ /*
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited GP
+ * until such time as the ->expmask bits are cleared.
+ */
+ if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp))
+ WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -393,13 +400,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
}
mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
- /*
- * Need to wait for any blocked tasks as well. Note that
- * additional blocking tasks will also block the expedited GP
- * until such time as the ->expmask bits are cleared.
- */
- if (rcu_preempt_has_tasks(rnp))
- WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
@@ -751,12 +751,8 @@ static void rcu_exp_handler(void *unused)
struct task_struct *t = current;
/*
- * First, is there no need for a quiescent state from this CPU,
- * or is this CPU already looking for a quiescent state for the
- * current grace period? If either is the case, just leave.
- * However, this should not happen due to the preemptible
- * sync_sched_exp_online_cleanup() implementation being a no-op,
- * so warn if this does happen.
+ * WARN if the CPU is unexpectedly already looking for a
+ * QS or has already reported one.
*/
ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
@@ -803,11 +799,6 @@ static void rcu_exp_handler(void *unused)
WARN_ON_ONCE(1);
}
-/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
-}
-
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each that is blocking the current
@@ -885,38 +876,6 @@ static void rcu_exp_handler(void *unused)
rcu_exp_need_qs();
}
-/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
-static void sync_sched_exp_online_cleanup(int cpu)
-{
- unsigned long flags;
- int my_cpu;
- struct rcu_data *rdp;
- int ret;
- struct rcu_node *rnp;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
- rnp = rdp->mynode;
- my_cpu = get_cpu();
- /* Quiescent state either not needed or already requested, leave. */
- if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
- READ_ONCE(rdp->cpu_no_qs.b.exp)) {
- put_cpu();
- return;
- }
- /* Quiescent state needed on current CPU, so set it up locally. */
- if (my_cpu == cpu) {
- local_irq_save(flags);
- rcu_exp_need_qs();
- local_irq_restore(flags);
- put_cpu();
- return;
- }
- /* Quiescent state needed on some other CPU, send IPI. */
- ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
- put_cpu();
- WARN_ON_ONCE(ret);
-}
-
/*
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections that are
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 1596812f7f12..e6cd56603cad 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -276,7 +276,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
* callback storms, no need to wake up too early.
*/
if (waketype == RCU_NOCB_WAKE_LAZY &&
- rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
+ rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) {
mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush());
WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
} else if (waketype == RCU_NOCB_WAKE_BYPASS) {
@@ -985,7 +985,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
{
unsigned long flags;
- struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+ struct rcu_data *rdp = timer_container_of(rdp, t, nocb_timer);
WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
@@ -1146,7 +1146,6 @@ static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp)
static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
{
int wake_gp;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
WARN_ON_ONCE(cpu_online(rdp->cpu));
/*
@@ -1156,7 +1155,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
if (!rdp->nocb_gp_rdp)
return -EINVAL;
- if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread))
+ if (WARN_ON_ONCE(!rdp->nocb_gp_kthread))
return -EINVAL;
pr_info("Offloading %d\n", rdp->cpu);
@@ -1166,7 +1165,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp)
wake_gp = rcu_nocb_queue_toggle_rdp(rdp);
if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
+ wake_up_process(rdp->nocb_gp_kthread);
swait_event_exclusive(rdp->nocb_state_wq,
rcu_nocb_rdp_offload_wait_cond(rdp));
@@ -1564,6 +1563,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
if (rdp->nocb_gp_rdp == rdp)
show_rcu_nocb_gp_state(rdp);
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return;
+
nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp,
&rdp->nocb_entry_rdp,
typeof(*rdp),
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0b0f56f6abc8..fc14adf15cbb 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
struct rcu_node *rnp;
union rcu_special special;
+ rdp = this_cpu_ptr(&rcu_data);
+ if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
+ rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
+
/*
* If RCU core is waiting for this CPU to exit its critical section,
* report the fact that it has exited. Because irqs are disabled,
* t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
- rdp = this_cpu_ptr(&rcu_data);
if (!special.s && !rdp->cpu_no_qs.b.exp) {
local_irq_restore(flags);
return;
@@ -534,7 +537,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
(!empty_norm || rnp->qsmask));
empty_exp = sync_rcu_exp_done(rnp);
- smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
t->rcu_blocked_node = NULL;
@@ -624,10 +626,98 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t)
*/
static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
{
+ unsigned long flags;
struct rcu_data *rdp;
rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
- rdp->defer_qs_iw_pending = false;
+ local_irq_save(flags);
+
+ /*
+	 * If the IRQ work handler happens to run in the middle of an RCU read-side
+	 * critical section, it could be ineffective in getting the scheduler's
+	 * attention to report a deferred quiescent state (the whole point of the
+	 * IRQ work). For this reason, mark the state DEFER_QS_IDLE so that
+	 * the IRQ work can be requeued from a later rcu_read_unlock().
+	 *
+	 * Basically, we want to avoid the following situation:
+ * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
+ * 2. CPU enters new rcu_read_lock()
+ * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
+ * 4. rcu_read_unlock() does not re-queue work (state still PENDING)
+ * 5. Deferred QS reporting does not happen.
+ */
+ if (rcu_preempt_depth() > 0)
+ WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
+
+ local_irq_restore(flags);
+}
+
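The defer_qs_iw_pending conversion from bool to DEFER_QS_IDLE/DEFER_QS_PENDING is a small state machine: the unlock path queues the IRQ work only when the state is not PENDING, the handler falls back to IDLE if it fires inside a read-side critical section, and the deferred-QS report path returns the state to IDLE once the quiescent state is reported. A generic sketch of that handshake with hypothetical names, not the RCU code itself:

/* Illustrative sketch of the two-state deferred-work handshake; hypothetical
 * names, not the RCU implementation itself. */
#include <linux/irq_work.h>

#define WORK_IDLE	0
#define WORK_PENDING	1

struct deferred_req {
	struct irq_work	iw;
	int		state;		/* WORK_IDLE or WORK_PENDING */
};

static bool can_act_now;		/* placeholder predicate, set elsewhere */

static void deferred_req_handler(struct irq_work *iwp)
{
	struct deferred_req *req = container_of(iwp, struct deferred_req, iw);

	/* Cannot act yet: drop back to IDLE so a later caller may requeue. */
	if (!READ_ONCE(can_act_now))
		WRITE_ONCE(req->state, WORK_IDLE);
}

static void deferred_req_kick(struct deferred_req *req, int cpu)
{
	if (READ_ONCE(req->state) == WORK_PENDING)
		return;			/* queued and not yet resolved */
	req->iw = IRQ_WORK_INIT_HARD(deferred_req_handler);
	WRITE_ONCE(req->state, WORK_PENDING);
	irq_work_queue_on(&req->iw, cpu);
}

static void deferred_req_done(struct deferred_req *req)
{
	/* The deferred action finally happened; allow future kicks. */
	WRITE_ONCE(req->state, WORK_IDLE);
}
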
+/*
+ * Check if expedited grace period processing during unlock is needed.
+ *
+ * This function determines whether expedited handling is required based on:
+ * 1. Task blocking an expedited grace period (based on a heuristic, could be
+ * false-positive, see below.)
+ * 2. CPU participating in an expedited grace period
+ * 3. Strict grace period mode requiring expedited handling
+ * 4. RCU priority deboosting needs when interrupts were disabled
+ *
+ * @t: The task being checked
+ * @rdp: The per-CPU RCU data
+ * @rnp: The RCU node for this CPU
+ * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock()
+ *
+ * Returns true if expedited processing of the rcu_read_unlock() is needed.
+ */
+static bool rcu_unlock_needs_exp_handling(struct task_struct *t,
+ struct rcu_data *rdp,
+ struct rcu_node *rnp,
+ bool irqs_were_disabled)
+{
+ /*
+ * Check if this task is blocking an expedited grace period. If the
+ * task was preempted within an RCU read-side critical section and is
+ * on the expedited grace period blockers list (exp_tasks), we need
+ * expedited handling to unblock the expedited GP. This is not an exact
+ * check because 't' might not be on the exp_tasks list at all - its
+ * just a fast heuristic that can be false-positive sometimes.
+ */
+ if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks))
+ return true;
+
+ /*
+ * Check if this CPU is participating in an expedited grace period.
+ * The expmask bitmap tracks which CPUs need to check in for the
+ * current expedited GP. If our CPU's bit is set, we need expedited
+ * handling to help complete the expedited GP.
+ */
+ if (rdp->grpmask & READ_ONCE(rnp->expmask))
+ return true;
+
+ /*
+ * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods
+ * are treated as short for testing purposes even if that means
+ * disturbing the system more. Check if either:
+ * - This CPU has not yet reported a quiescent state, or
+ * - This task was preempted within an RCU critical section
+ * In either case, require expedited handling for strict GP mode.
+ */
+ if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+ ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node))
+ return true;
+
+ /*
+ * RCU priority boosting case: If a task is subject to RCU priority
+ * boosting and exits an RCU read-side critical section with interrupts
+ * disabled, we need expedited handling to ensure timely deboosting.
+ * Without this, a low-priority task could incorrectly run at high
+ * real-time priority for an extended period, degrading real-time
+ * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels,
+ * not just to PREEMPT_RT.
+ */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node)
+ return true;
+
+ return false;
}
/*
@@ -649,18 +739,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
irqs_were_disabled = irqs_disabled_flags(flags);
if (preempt_bh_were_disabled || irqs_were_disabled) {
- bool expboost; // Expedited GP in flight or possible boosting.
+ bool needs_exp; // Expedited handling needed.
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
- (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
- (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
- ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
- (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
- t->rcu_blocked_node);
+ needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled);
+
// Need to defer quiescent state until everything is enabled.
- if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
+ if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) {
// Using softirq, safe to awaken, and either the
// wakeup is free or there is either an expedited
// GP in flight or a potential need to deboost.
@@ -673,17 +759,13 @@ static void rcu_read_unlock_special(struct task_struct *t)
set_tsk_need_resched(current);
set_preempt_need_resched();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
- expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
+ needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING &&
+ cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
- IS_ENABLED(CONFIG_PREEMPT_RT))
- rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
- rcu_preempt_deferred_qs_handler);
- else
- init_irq_work(&rdp->defer_qs_iw,
- rcu_preempt_deferred_qs_handler);
- rdp->defer_qs_iw_pending = true;
+ rdp->defer_qs_iw =
+ IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
+ rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 56b21219442b..2a5a5329322d 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -17,8 +17,59 @@
// Controlling CPU stall warnings, including delay calculation.
/* panic() on RCU Stall sysctl. */
-int sysctl_panic_on_rcu_stall __read_mostly;
-int sysctl_max_rcu_stall_to_panic __read_mostly;
+static int sysctl_panic_on_rcu_stall __read_mostly;
+static int sysctl_max_rcu_stall_to_panic __read_mostly;
+
+static const struct ctl_table rcu_stall_sysctl_table[] = {
+ {
+ .procname = "panic_on_rcu_stall",
+ .data = &sysctl_panic_on_rcu_stall,
+ .maxlen = sizeof(sysctl_panic_on_rcu_stall),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "max_rcu_stall_to_panic",
+ .data = &sysctl_max_rcu_stall_to_panic,
+ .maxlen = sizeof(sysctl_max_rcu_stall_to_panic),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+};
+
+static int __init init_rcu_stall_sysctl(void)
+{
+ register_sysctl_init("kernel", rcu_stall_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rcu_stall_sysctl);
+
+#ifdef CONFIG_SYSFS
+
+static unsigned int rcu_stall_count;
+
+static ssize_t rcu_stall_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%u\n", rcu_stall_count);
+}
+
+static struct kobj_attribute rcu_stall_count_attr = __ATTR_RO(rcu_stall_count);
+
+static __init int kernel_rcu_stall_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &rcu_stall_count_attr.attr, NULL);
+ return 0;
+}
+
+late_initcall(kernel_rcu_stall_sysfs_init);
+
+#endif // CONFIG_SYSFS
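
With the table registered under "kernel" and the attribute added to the kernel kobject, the new knobs should surface as /proc/sys/kernel/panic_on_rcu_stall, /proc/sys/kernel/max_rcu_stall_to_panic and /sys/kernel/rcu_stall_count (paths inferred from the registration calls above, not stated in the patch). A small userspace sketch that reads them:

/* Userspace sketch; paths inferred from register_sysctl_init("kernel", ...)
 * and sysfs_add_file_to_group(kernel_kobj, ...) above. */
#include <stdio.h>

static long read_long(const char *path)
{
	long val = -1;
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	printf("panic_on_rcu_stall:     %ld\n",
	       read_long("/proc/sys/kernel/panic_on_rcu_stall"));
	printf("max_rcu_stall_to_panic: %ld\n",
	       read_long("/proc/sys/kernel/max_rcu_stall_to_panic"));
	printf("rcu_stall_count:        %ld\n",
	       read_long("/sys/kernel/rcu_stall_count"));
	return 0;
}
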
#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -784,6 +835,10 @@ static void check_cpu_stall(struct rcu_data *rdp)
if (kvm_check_and_clear_guest_paused())
return;
+#ifdef CONFIG_SYSFS
+ ++rcu_stall_count;
+#endif
+
rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps);
if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) {
pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);
@@ -927,8 +982,7 @@ void show_rcu_gp_kthreads(void)
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
- if (rcu_segcblist_is_offloaded(&rdp->cblist))
- show_rcu_nocb_state(rdp);
+ show_rcu_nocb_state(rdp);
}
pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
show_rcu_tasks_gp_kthreads();
diff --git a/kernel/relay.c b/kernel/relay.c
index 5ac7e711e4b6..c0c93a04d4ce 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -452,7 +452,7 @@ int relay_prepare_cpu(unsigned int cpu)
/**
* relay_open - create a new relay channel
- * @base_filename: base name of files to create, %NULL for buffering only
+ * @base_filename: base name of files to create
* @parent: dentry of parent directory, %NULL for root directory or buffer
* @subbuf_size: size of sub-buffers
* @n_subbufs: number of sub-buffers
@@ -465,10 +465,6 @@ int relay_prepare_cpu(unsigned int cpu)
* attributes specified. The created channel buffer files
* will be named base_filename0...base_filenameN-1. File
* permissions will be %S_IRUSR.
- *
- * If opening a buffer (@parent = NULL) that you later wish to register
- * in a filesystem, call relay_late_setup_files() once the @parent dentry
- * is available.
*/
struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
@@ -540,111 +536,6 @@ struct rchan_percpu_buf_dispatcher {
struct dentry *dentry;
};
-/* Called in atomic context. */
-static void __relay_set_buf_dentry(void *info)
-{
- struct rchan_percpu_buf_dispatcher *p = info;
-
- relay_set_buf_dentry(p->buf, p->dentry);
-}
-
-/**
- * relay_late_setup_files - triggers file creation
- * @chan: channel to operate on
- * @base_filename: base name of files to create
- * @parent: dentry of parent directory, %NULL for root directory
- *
- * Returns 0 if successful, non-zero otherwise.
- *
- * Use to setup files for a previously buffer-only channel created
- * by relay_open() with a NULL parent dentry.
- *
- * For example, this is useful for perfomring early tracing in kernel,
- * before VFS is up and then exposing the early results once the dentry
- * is available.
- */
-int relay_late_setup_files(struct rchan *chan,
- const char *base_filename,
- struct dentry *parent)
-{
- int err = 0;
- unsigned int i, curr_cpu;
- unsigned long flags;
- struct dentry *dentry;
- struct rchan_buf *buf;
- struct rchan_percpu_buf_dispatcher disp;
-
- if (!chan || !base_filename)
- return -EINVAL;
-
- strscpy(chan->base_filename, base_filename, NAME_MAX);
-
- mutex_lock(&relay_channels_mutex);
- /* Is chan already set up? */
- if (unlikely(chan->has_base_filename)) {
- mutex_unlock(&relay_channels_mutex);
- return -EEXIST;
- }
- chan->has_base_filename = 1;
- chan->parent = parent;
-
- if (chan->is_global) {
- err = -EINVAL;
- buf = *per_cpu_ptr(chan->buf, 0);
- if (!WARN_ON_ONCE(!buf)) {
- dentry = relay_create_buf_file(chan, buf, 0);
- if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
- relay_set_buf_dentry(buf, dentry);
- err = 0;
- }
- }
- mutex_unlock(&relay_channels_mutex);
- return err;
- }
-
- curr_cpu = get_cpu();
- /*
- * The CPU hotplug notifier ran before us and created buffers with
- * no files associated. So it's safe to call relay_setup_buf_file()
- * on all currently online CPUs.
- */
- for_each_online_cpu(i) {
- buf = *per_cpu_ptr(chan->buf, i);
- if (unlikely(!buf)) {
- WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
- err = -EINVAL;
- break;
- }
-
- dentry = relay_create_buf_file(chan, buf, i);
- if (unlikely(!dentry)) {
- err = -EINVAL;
- break;
- }
-
- if (curr_cpu == i) {
- local_irq_save(flags);
- relay_set_buf_dentry(buf, dentry);
- local_irq_restore(flags);
- } else {
- disp.buf = buf;
- disp.dentry = dentry;
- smp_mb();
- /* relay_channels_mutex must be held, so wait. */
- err = smp_call_function_single(i,
- __relay_set_buf_dentry,
- &disp, 1);
- }
- if (unlikely(err))
- break;
- }
- put_cpu();
- mutex_unlock(&relay_channels_mutex);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(relay_late_setup_files);
-
/**
* relay_switch_subbuf - switch to a new sub-buffer
* @buf: channel buffer
diff --git a/kernel/resource.c b/kernel/resource.c
index 8d3e6ed0bdc1..f9bb5481501a 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1279,8 +1279,9 @@ static int __request_region_locked(struct resource *res, struct resource *parent
* become unavailable to other users. Conflicts are
* not expected. Warn to aid debugging if encountered.
*/
- if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
- pr_warn("Unaddressable device %s %pR conflicts with %pR",
+ if (parent == &iomem_resource &&
+ conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+ pr_warn("Unaddressable device %s %pR conflicts with %pR\n",
conflict->name, conflict, res);
}
if (conflict != parent) {
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2b331822c7e7..cdea931aae30 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -4,6 +4,9 @@
* Auto-group scheduling implementation:
*/
+#include "autogroup.h"
+#include "sched.h"
+
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
@@ -25,9 +28,9 @@ static void __init sched_autogroup_sysctl_init(void)
{
register_sysctl_init("kernel", sched_autogroup_sysctls);
}
-#else
+#else /* !CONFIG_SYSCTL: */
#define sched_autogroup_sysctl_init() do { } while (0)
-#endif
+#endif /* !CONFIG_SYSCTL */
void __init autogroup_init(struct task_struct *init_task)
{
@@ -108,7 +111,7 @@ static inline struct autogroup *autogroup_create(void)
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
tg->rt_rq = root_task_group.rt_rq;
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
tg->autogroup = ag;
sched_online_group(tg, &root_task_group);
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 90d69f2c5eaf..06c82b2bdfb5 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -2,6 +2,8 @@
#ifndef _KERNEL_SCHED_AUTOGROUP_H
#define _KERNEL_SCHED_AUTOGROUP_H
+#include "sched.h"
+
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
@@ -41,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
-#else /* !CONFIG_SCHED_AUTOGROUP */
+#else /* !CONFIG_SCHED_AUTOGROUP: */
static inline void autogroup_init(struct task_struct *init_task) { }
static inline void autogroup_free(struct task_group *tg) { }
@@ -61,6 +63,6 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
return 0;
}
-#endif /* CONFIG_SCHED_AUTOGROUP */
+#endif /* !CONFIG_SCHED_AUTOGROUP */
#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 72d97aa8b726..c4a488e67aa7 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -50,11 +50,9 @@
#include "idle.c"
#include "rt.c"
+#include "cpudeadline.c"
-#ifdef CONFIG_SMP
-# include "cpudeadline.c"
-# include "pelt.c"
-#endif
+#include "pelt.c"
#include "cputime.c"
#include "deadline.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index bf9d8db94b70..e2cf3b08d4e9 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -80,11 +80,10 @@
#include "wait_bit.c"
#include "wait.c"
-#ifdef CONFIG_SMP
-# include "cpupri.c"
-# include "stop_task.c"
-# include "topology.c"
-#endif
+#include "cpupri.c"
+#include "stop_task.c"
+
+#include "topology.c"
#ifdef CONFIG_SCHED_CORE
# include "core_sched.c"
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index a09655b48140..f5e6dd6a6b3a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -54,6 +54,9 @@
*
*/
+#include <linux/sched/clock.h>
+#include "sched.h"
+
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
@@ -471,7 +474,7 @@ notrace void sched_clock_idle_wakeup_event(void)
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */
void __init sched_clock_init(void)
{
@@ -489,7 +492,7 @@ notrace u64 sched_clock_cpu(int cpu)
return sched_clock();
}
-#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/*
* Running clock - returns the time that has elapsed while a guest has been
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 3561ab533dd4..19ee702273c0 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -13,6 +13,11 @@
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
+#include <linux/linkage.h>
+#include <linux/sched/debug.h>
+#include <linux/completion.h>
+#include "sched.h"
+
static void complete_with_flags(struct completion *x, int wake_flags)
{
unsigned long flags;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62b3416f5e43..66d93a872968 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -69,8 +69,8 @@
#include <linux/livepatch_sched.h>
#ifdef CONFIG_PREEMPT_DYNAMIC
-# ifdef CONFIG_GENERIC_ENTRY
-# include <linux/entry-common.h>
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
# endif
#endif
@@ -96,6 +96,7 @@
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
+#include "../locking/mutex.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
@@ -119,6 +120,35 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static int __init setup_proxy_exec(char *str)
+{
+ bool proxy_enable = true;
+
+ if (*str && kstrtobool(str + 1, &proxy_enable)) {
+ pr_warn("Unable to parse sched_proxy_exec=\n");
+ return 0;
+ }
+
+ if (proxy_enable) {
+ pr_info("sched_proxy_exec enabled via boot arg\n");
+ static_branch_enable(&__sched_proxy_exec);
+ } else {
+ pr_info("sched_proxy_exec disabled via boot arg\n");
+ static_branch_disable(&__sched_proxy_exec);
+ }
+ return 1;
+}
+#else
+static int __init setup_proxy_exec(char *str)
+{
+ pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n");
+ return 0;
+}
+#endif
+__setup("sched_proxy_exec", setup_proxy_exec);
+
/*
* Debugging: various feature bits
*
@@ -481,13 +511,13 @@ void sched_core_put(void)
schedule_work(&_work);
}
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
-#endif /* CONFIG_SCHED_CORE */
+#endif /* !CONFIG_SCHED_CORE */
/* need a wrapper since we may need to trace from modules */
EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
@@ -650,7 +680,6 @@ void raw_spin_rq_unlock(struct rq *rq)
raw_spin_unlock(rq_lockp(rq));
}
-#ifdef CONFIG_SMP
/*
* double_rq_lock - safely lock two runqueues
*/
@@ -667,7 +696,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
double_rq_clock_clear_update(rq1, rq2);
}
-#endif
/*
* __task_rq_lock - lock the rq @p resides on.
@@ -853,8 +881,6 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-#ifdef CONFIG_SMP
-
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
@@ -899,33 +925,12 @@ void hrtick_start(struct rq *rq, u64 delay)
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
-#else
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and IRQs disabled
- */
-void hrtick_start(struct rq *rq, u64 delay)
-{
- /*
- * Don't schedule slices shorter than 10000ns, that just
- * doesn't make sense. Rely on vruntime for fairness.
- */
- delay = max_t(u64, delay, 10000LL);
- hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED_HARD);
-}
-
-#endif /* CONFIG_SMP */
-
static void hrtick_rq_init(struct rq *rq)
{
-#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
-#endif
hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
-#else /* CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void hrtick_clear(struct rq *rq)
{
}
@@ -933,7 +938,7 @@ static inline void hrtick_clear(struct rq *rq)
static inline void hrtick_rq_init(struct rq *rq)
{
}
-#endif /* CONFIG_SCHED_HRTICK */
+#endif /* !CONFIG_SCHED_HRTICK */
/*
* try_cmpxchg based fetch_or() macro so it works for different integer types:
@@ -949,7 +954,7 @@ static inline void hrtick_rq_init(struct rq *rq)
_val; \
})
-#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+#ifdef TIF_POLLING_NRFLAG
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
@@ -988,13 +993,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
return true;
}
-#ifdef CONFIG_SMP
static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
-#endif
static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
@@ -1110,6 +1113,7 @@ static void __resched_curr(struct rq *rq, int tif)
cpu = cpu_of(rq);
+ trace_sched_set_need_resched_tp(curr, cpu, tif);
if (cpu == smp_processor_id()) {
set_ti_thread_flag(cti, tif);
if (tif == TIF_NEED_RESCHED)
@@ -1125,6 +1129,11 @@ static void __resched_curr(struct rq *rq, int tif)
}
}
+void __trace_set_need_resched(struct task_struct *curr, int tif)
+{
+ trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
+}
+
void resched_curr(struct rq *rq)
{
__resched_curr(rq, TIF_NEED_RESCHED);
@@ -1167,7 +1176,6 @@ void resched_cpu(int cpu)
raw_spin_rq_unlock_irqrestore(rq, flags);
}
-#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy CPU for migrating timers
@@ -1374,10 +1382,8 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
-#endif /* CONFIG_SMP */
-#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
- (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED)
/*
* Iterate task_group tree rooted at *from, calling @down when first entering a
* node and @up when leaving it for the final time.
@@ -1971,7 +1977,7 @@ undo:
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
return result;
}
-#endif
+#endif /* CONFIG_SYSCTL */
static void uclamp_fork(struct task_struct *p)
{
@@ -2037,13 +2043,13 @@ static void __init init_uclamp(void)
}
}
-#else /* !CONFIG_UCLAMP_TASK */
+#else /* !CONFIG_UCLAMP_TASK: */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
-#endif /* CONFIG_UCLAMP_TASK */
+#endif /* !CONFIG_UCLAMP_TASK */
bool sched_task_on_rq(struct task_struct *p)
{
@@ -2353,8 +2359,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
return ncsw;
}
-#ifdef CONFIG_SMP
-
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
@@ -2936,8 +2940,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ /*
+	 * Can the task run on the task's current CPU? If so, we're done.
+	 *
+	 * We are also done if the task is the current donor, boosting a lock-
+	 * holding proxy (and potentially has been migrated outside its
+	 * current or previous affinity mask).
+ */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) ||
+ (task_current_donor(rq, p) && !task_current(rq, p))) {
struct task_struct *push_task = NULL;
if ((flags & SCA_MIGRATE_ENABLE) &&
@@ -3305,6 +3316,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
WARN_ON_ONCE(ret);
}
+#ifdef CONFIG_SMP
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
unsigned int state = READ_ONCE(p->__state);
@@ -3358,6 +3371,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
+#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
@@ -3657,17 +3671,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
}
}
-#else /* CONFIG_SMP */
-
-static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
-
-static inline bool rq_has_pinned_tasks(struct rq *rq)
-{
- return false;
-}
-
-#endif /* !CONFIG_SMP */
-
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
@@ -3678,7 +3681,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
rq = this_rq();
-#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
__schedstat_inc(p->stats.nr_wakeups_local);
@@ -3698,7 +3700,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
if (wake_flags & WF_MIGRATED)
__schedstat_inc(p->stats.nr_wakeups_migrate);
-#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
__schedstat_inc(p->stats.nr_wakeups);
@@ -3727,13 +3728,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
-#ifdef CONFIG_SMP
if (wake_flags & WF_RQ_SELECTED)
en_flags |= ENQUEUE_RQ_SELECTED;
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
else
-#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
@@ -3744,7 +3743,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
ttwu_do_wakeup(p);
-#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Our task @p is fully woken up and running; so it's safe to
@@ -3766,7 +3764,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
rq->idle_stamp = 0;
}
-#endif
}
/*
@@ -3820,7 +3817,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
return ret;
}
-#ifdef CONFIG_SMP
void sched_ttwu_pending(void *arg)
{
struct llist_node *llist = arg;
@@ -3887,7 +3883,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
+#ifdef CONFIG_SMP
__smp_call_single_queue(cpu, &p->wake_entry.llist);
+#endif
}
void wake_up_if_idle(int cpu)
@@ -3939,6 +3937,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
if (!scx_allow_ttwu_queue(p))
return false;
+#ifdef CONFIG_SMP
+ if (p->sched_class == &stop_sched_class)
+ return false;
+#endif
+
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
@@ -3988,15 +3991,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
return false;
}
-#else /* !CONFIG_SMP */
-
-static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
-{
- return false;
-}
-
-#endif /* CONFIG_SMP */
-
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
@@ -4252,7 +4246,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
break;
-#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -4331,9 +4324,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-#else
- cpu = task_cpu(p);
-#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
}
@@ -4366,14 +4356,12 @@ static bool __task_needs_rq_lock(struct task_struct *p)
if (p->on_rq)
return true;
-#ifdef CONFIG_SMP
/*
* Ensure the task has finished __schedule() and will not be referenced
* anymore. Again, see try_to_wake_up() for a longer comment.
*/
smp_rmb();
smp_cond_load_acquire(&p->on_cpu, !VAL);
-#endif
return false;
}
@@ -4529,10 +4517,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
-#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
-#endif
init_sched_mm_cid(p);
}
@@ -4595,8 +4581,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write,
}
return err;
}
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SCHEDSTATS
@@ -4783,14 +4769,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP)
p->on_cpu = 0;
-#endif
init_task_preempt_count(p);
-#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-#endif
+
return 0;
}
@@ -4867,7 +4850,6 @@ void wake_up_new_task(struct task_struct *p)
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
-#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_ptr can change in the fork path
@@ -4879,7 +4861,6 @@ void wake_up_new_task(struct task_struct *p)
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
-#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);
@@ -4887,7 +4868,6 @@ void wake_up_new_task(struct task_struct *p)
activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, wake_flags);
-#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Nothing relies on rq->lock after this, so it's fine to
@@ -4897,7 +4877,6 @@ void wake_up_new_task(struct task_struct *p)
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
-#endif
task_rq_unlock(rq, p, &rf);
}
@@ -4974,7 +4953,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
__fire_sched_out_preempt_notifiers(curr, next);
}
-#else /* !CONFIG_PREEMPT_NOTIFIERS */
+#else /* !CONFIG_PREEMPT_NOTIFIERS: */
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
@@ -4986,11 +4965,10 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
{
}
-#endif /* CONFIG_PREEMPT_NOTIFIERS */
+#endif /* !CONFIG_PREEMPT_NOTIFIERS */
static inline void prepare_task(struct task_struct *next)
{
-#ifdef CONFIG_SMP
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
@@ -4999,12 +4977,10 @@ static inline void prepare_task(struct task_struct *next)
* its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
-#endif
}
static inline void finish_task(struct task_struct *prev)
{
-#ifdef CONFIG_SMP
/*
* This must be the very last reference to @prev from this CPU. After
* p->on_cpu is cleared, the task can be moved to a different CPU. We
@@ -5017,11 +4993,8 @@ static inline void finish_task(struct task_struct *prev)
* Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
-#endif
}
-#ifdef CONFIG_SMP
-
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
@@ -5103,14 +5076,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head)
}
}
-#else
-
-static inline void __balance_callbacks(struct rq *rq)
-{
-}
-
-#endif
-
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -5320,7 +5285,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
* switched the context for the first time. It is returning from
* schedule for the first time in this path.
*/
- trace_sched_exit_tp(true, CALLER_ADDR0);
+ trace_sched_exit_tp(true);
preempt_enable();
if (current->set_child_tid)
@@ -5498,8 +5463,6 @@ unsigned int nr_iowait(void)
return sum;
}
-#ifdef CONFIG_SMP
-
/*
* sched_exec - execve() is a valuable balancing opportunity, because at
* this point the task has the smallest effective memory and cache footprint.
@@ -5523,8 +5486,6 @@ void sched_exec(void)
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
-#endif
-
DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
@@ -5559,7 +5520,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns;
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+#ifdef CONFIG_64BIT
/*
* 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have an optimization chance when the task's delta_exec is 0.
@@ -5686,12 +5647,10 @@ void sched_tick(void)
if (donor->flags & PF_WQ_WORKER)
wq_worker_tick(donor);
-#ifdef CONFIG_SMP
if (!scx_switched_all()) {
rq->idle_balance = idle_cpu(cpu);
sched_balance_trigger(rq);
}
-#endif
}
#ifdef CONFIG_NO_HZ_FULL
@@ -5831,10 +5790,10 @@ int __init sched_tick_offload_init(void)
return 0;
}
-#else /* !CONFIG_NO_HZ_FULL */
+#else /* !CONFIG_NO_HZ_FULL: */
static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
-#endif
+#endif /* !CONFIG_NO_HZ_FULL */
#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
@@ -6549,7 +6508,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu)
rq->core = rq;
}
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
@@ -6561,7 +6520,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return __pick_next_task(rq, prev, rf);
}
-#endif /* CONFIG_SCHED_CORE */
+#endif /* !CONFIG_SCHED_CORE */
/*
* Constants for the sched_mode argument of __schedule().
@@ -6577,11 +6536,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Helper function for __schedule()
*
- * If a task does not have signals pending, deactivate it
- * Otherwise marks the task's __state as RUNNING
+ * Tries to deactivate the task, unless the should_block arg
+ * is false or a signal is pending. If a signal is pending,
+ * marks the task's __state as RUNNING (and clears blocked_on).
*/
static bool try_to_block_task(struct rq *rq, struct task_struct *p,
- unsigned long *task_state_p)
+ unsigned long *task_state_p, bool should_block)
{
unsigned long task_state = *task_state_p;
int flags = DEQUEUE_NOCLOCK;
@@ -6592,6 +6553,16 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
return false;
}
+ /*
+ * We check should_block after signal_pending because we
+ * will want to wake the task in that case. But if
+ * should_block is false, it's likely due to the task being
+ * blocked on a mutex, and we want to keep it on the runqueue
+ * to be selectable for proxy-execution.
+ */
+ if (!should_block)
+ return false;
+
p->sched_contributes_to_load =
(task_state & TASK_UNINTERRUPTIBLE) &&
!(task_state & TASK_NOLOAD) &&
@@ -6615,6 +6586,194 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
return true;
}
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline struct task_struct *proxy_resched_idle(struct rq *rq)
+{
+ put_prev_set_next_task(rq, rq->donor, rq->idle);
+ rq_set_donor(rq, rq->idle);
+ set_tsk_need_resched(rq->idle);
+ return rq->idle;
+}
+
+static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ unsigned long state = READ_ONCE(donor->__state);
+
+ /* Don't deactivate if the state has been changed to TASK_RUNNING */
+ if (state == TASK_RUNNING)
+ return false;
+ /*
+ * Because we got donor from pick_next_task(), it is *crucial*
+ * that we call proxy_resched_idle() before we deactivate it.
+ * As once we deactivate donor, donor->on_rq is set to zero,
+ * which allows ttwu() to immediately try to wake the task on
+ * another rq. So we cannot use *any* references to donor
+ * after that point. So things like cfs_rq->curr or rq->donor
+ * need to be changed from next *before* we deactivate.
+ */
+ proxy_resched_idle(rq);
+ return try_to_block_task(rq, donor, &state, true);
+}
+
+static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ if (!__proxy_deactivate(rq, donor)) {
+ /*
+ * XXX: For now, if deactivation failed, set donor
+ * as unblocked, as we aren't doing proxy-migrations
+ * yet (more logic will be needed then).
+ */
+ donor->blocked_on = NULL;
+ }
+ return NULL;
+}
+
+/*
+ * Find runnable lock owner to proxy for mutex blocked donor
+ *
+ * Follow the blocked-on relation:
+ * task->blocked_on -> mutex->owner -> task...
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * mutex->wait_lock
+ *
+ * Returns the task that is going to be used as execution context (the one
+ * that is actually going to be run on cpu_of(rq)).
+ */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ struct task_struct *owner = NULL;
+ int this_cpu = cpu_of(rq);
+ struct task_struct *p;
+ struct mutex *mutex;
+
+ /* Follow blocked_on chain. */
+ for (p = donor; task_is_blocked(p); p = owner) {
+ mutex = p->blocked_on;
+ /* Something changed in the chain, so pick again */
+ if (!mutex)
+ return NULL;
+ /*
+ * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
+ * and ensure @owner sticks around.
+ */
+ guard(raw_spinlock)(&mutex->wait_lock);
+
+ /* Check again that p is blocked with wait_lock held */
+ if (mutex != __get_task_blocked_on(p)) {
+ /*
+ * Something changed in the blocked_on chain and
+ * we don't know if only at this level. So, let's
+ * just bail out completely and let __schedule()
+ * figure things out (pick_again loop).
+ */
+ return NULL;
+ }
+
+ owner = __mutex_owner(mutex);
+ if (!owner) {
+ __clear_task_blocked_on(p, mutex);
+ return p;
+ }
+
+ if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
+ /* XXX Don't handle blocked owners/delayed dequeue yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_cpu(owner) != this_cpu) {
+ /* XXX Don't handle migrations yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_on_rq_migrating(owner)) {
+ /*
+ * One of the chain of mutex owners is currently migrating to this
+ * CPU, but has not yet been enqueued because we are holding the
+ * rq lock. As a simple solution, just schedule rq->idle to give
+ * the migration a chance to complete. Much like the migrate_task
+ * case we should end up back in find_proxy_task(), this time
+ * hopefully with all relevant tasks already enqueued.
+ */
+ return proxy_resched_idle(rq);
+ }
+
+ /*
+ * It's possible to race such that, after we check owner->on_rq
+ * but before we check (owner_cpu != this_cpu), the task on
+ * another cpu is migrated back to this cpu. In that case it
+ * could slip by our checks. So double-check
+ * we are still on this cpu and not migrating. If we get
+ * inconsistent results, try again.
+ */
+ if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu)
+ return NULL;
+
+ if (owner == p) {
+ /*
+ * It's possible we interleave with mutex_unlock like:
+ *
+ * lock(&rq->lock);
+ * find_proxy_task()
+ * mutex_unlock()
+ * lock(&wait_lock);
+ * donor(owner) = current->blocked_donor;
+ * unlock(&wait_lock);
+ *
+ * wake_up_q();
+ * ...
+ * ttwu_runnable()
+ * __task_rq_lock()
+ * lock(&wait_lock);
+ * owner == p
+ *
+ * Which leaves us to finish the ttwu_runnable() and make it go.
+ *
+ * So schedule rq->idle so that ttwu_runnable() can get the rq
+ * lock and mark owner as running.
+ */
+ return proxy_resched_idle(rq);
+ }
+ /*
+ * OK, now we're absolutely sure @owner is on this
+ * rq, therefore holding @rq->lock is sufficient to
+ * guarantee its existence, as per ttwu_remote().
+ */
+ }
+
+ WARN_ON_ONCE(owner && !owner->on_rq);
+ return owner;
+}
+#else /* !CONFIG_SCHED_PROXY_EXEC: */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n");
+ return donor;
+}
+#endif /* !CONFIG_SCHED_PROXY_EXEC */
+
+static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
+{
+ if (!sched_proxy_exec())
+ return;
+ /*
+ * pick_next_task() calls set_next_task() on the chosen task
+ * at some point, which ensures it is not push/pullable.
+ * However, the chosen/donor task *and* the mutex owner form an
+ * atomic pair wrt push/pull.
+ *
+ * Make sure the owner we run is not pushable. Unfortunately we can
+ * only deal with that by means of a dequeue/enqueue cycle. :-/
+ */
+ dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
+ enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
+}
+
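A minimal userspace sketch of the blocked_on chain walk performed by find_proxy_task() above, with all locking, migration and delayed-dequeue handling stripped out. The xtask/xmutex types are stand-ins invented for this illustration, not the kernel's task_struct/mutex; it only shows the donor -> mutex -> owner traversal until an unblocked task is found.

#include <stdio.h>

struct xmutex;

struct xtask {
	const char *comm;
	struct xmutex *blocked_on;	/* mutex this task sleeps on, or NULL */
};

struct xmutex {
	struct xtask *owner;		/* current owner, or NULL */
};

/* Follow donor -> mutex -> owner until a task that is not mutex-blocked. */
static struct xtask *proxy_chain_walk(struct xtask *donor)
{
	struct xtask *p = donor;

	while (p->blocked_on) {
		struct xtask *owner = p->blocked_on->owner;

		if (!owner)	/* lock was just released: run the waiter itself */
			return p;
		p = owner;
	}
	return p;
}

int main(void)
{
	struct xmutex m1 = { 0 }, m2 = { 0 };
	struct xtask c = { "C", NULL };	/* owns m2, not blocked */
	struct xtask b = { "B", &m2 };	/* owns m1, blocked on m2 */
	struct xtask a = { "A", &m1 };	/* donor, blocked on m1 */

	m1.owner = &b;
	m2.owner = &c;

	/* prints: run C on behalf of donor A */
	printf("run %s on behalf of donor %s\n", proxy_chain_walk(&a)->comm, a.comm);
	return 0;
}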
/*
* __schedule() is the main scheduler function.
*
@@ -6669,7 +6828,8 @@ static void __sched notrace __schedule(int sched_mode)
struct rq *rq;
int cpu;
- trace_sched_entry_tp(preempt, CALLER_ADDR0);
+ /* Trace preemptions consistently with task switches */
+ trace_sched_entry_tp(sched_mode == SM_PREEMPT);
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -6727,15 +6887,31 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- try_to_block_task(rq, prev, &prev_state);
+ /*
+ * We pass !task_is_blocked() as the should_block arg
+ * in order to keep mutex-blocked tasks on the runqueue
+ * for selection with proxy-exec (without proxy-exec
+ * task_is_blocked() will always be false).
+ */
+ try_to_block_task(rq, prev, &prev_state,
+ !task_is_blocked(prev));
switch_count = &prev->nvcsw;
}
- next = pick_next_task(rq, prev, &rf);
+pick_again:
+ next = pick_next_task(rq, rq->donor, &rf);
rq_set_donor(rq, next);
+ if (unlikely(task_is_blocked(next))) {
+ next = find_proxy_task(rq, next, &rf);
+ if (!next)
+ goto pick_again;
+ if (next == rq->idle)
+ goto keep_resched;
+ }
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+keep_resched:
rq->last_seen_need_resched_ns = 0;
is_switch = prev != next;
@@ -6746,6 +6922,10 @@ picked:
* changes to task_struct made by pick_next_task().
*/
RCU_INIT_POINTER(rq->curr, next);
+
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -6780,11 +6960,15 @@ picked:
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
+ /* In case next was already curr but just got blocked_donor */
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
}
- trace_sched_exit_tp(is_switch, CALLER_ADDR0);
+ trace_sched_exit_tp(is_switch);
}
void __noreturn do_task_dead(void)
@@ -6988,14 +7172,14 @@ NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#ifndef preempt_schedule_dynamic_enabled
-#define preempt_schedule_dynamic_enabled preempt_schedule
-#define preempt_schedule_dynamic_disabled NULL
-#endif
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# ifndef preempt_schedule_dynamic_enabled
+# define preempt_schedule_dynamic_enabled preempt_schedule
+# define preempt_schedule_dynamic_disabled NULL
+# endif
DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
void __sched notrace dynamic_preempt_schedule(void)
{
@@ -7005,8 +7189,8 @@ void __sched notrace dynamic_preempt_schedule(void)
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule);
EXPORT_SYMBOL(dynamic_preempt_schedule);
-#endif
-#endif
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
@@ -7061,14 +7245,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#ifndef preempt_schedule_notrace_dynamic_enabled
-#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
-#define preempt_schedule_notrace_dynamic_disabled NULL
-#endif
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# ifndef preempt_schedule_notrace_dynamic_enabled
+# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+# define preempt_schedule_notrace_dynamic_disabled NULL
+# endif
DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
void __sched notrace dynamic_preempt_schedule_notrace(void)
{
@@ -7078,7 +7262,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void)
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
-#endif
+# endif
#endif
#endif /* CONFIG_PREEMPTION */
@@ -7297,7 +7481,7 @@ out_unlock:
preempt_enable();
}
-#endif
+#endif /* CONFIG_RT_MUTEXES */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
@@ -7328,17 +7512,17 @@ EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define cond_resched_dynamic_enabled __cond_resched
-#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# define cond_resched_dynamic_enabled __cond_resched
+# define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
-#define might_resched_dynamic_enabled __cond_resched
-#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
+# define might_resched_dynamic_enabled __cond_resched
+# define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
@@ -7356,8 +7540,8 @@ int __sched dynamic_might_resched(void)
return __cond_resched();
}
EXPORT_SYMBOL(dynamic_might_resched);
-#endif
-#endif
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
/*
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -7423,9 +7607,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#ifdef CONFIG_GENERIC_ENTRY
-#include <linux/entry-common.h>
-#endif
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
+# endif
/*
* SC:cond_resched
@@ -7480,37 +7664,37 @@ int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
-#ifndef CONFIG_PREEMPT_RT
+# ifndef CONFIG_PREEMPT_RT
if (!strcmp(str, "none"))
return preempt_dynamic_none;
if (!strcmp(str, "voluntary"))
return preempt_dynamic_voluntary;
-#endif
+# endif
if (!strcmp(str, "full"))
return preempt_dynamic_full;
-#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
if (!strcmp(str, "lazy"))
return preempt_dynamic_lazy;
-#endif
+# endif
return -EINVAL;
}
-#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
+# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
+# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
-#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
-#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
-#else
-#error "Unsupported PREEMPT_DYNAMIC mechanism"
-#endif
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+# define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
+# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
+# else
+# error "Unsupported PREEMPT_DYNAMIC mechanism"
+# endif
static DEFINE_MUTEX(sched_dynamic_mutex);
@@ -7614,7 +7798,7 @@ static void __init preempt_dynamic_init(void)
}
}
-#define PREEMPT_MODEL_ACCESSOR(mode) \
+# define PREEMPT_MODEL_ACCESSOR(mode) \
bool preempt_model_##mode(void) \
{ \
WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
@@ -7659,7 +7843,7 @@ const char *preempt_model_str(void)
if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
seq_buf_printf(&s, "(%s)%s",
- preempt_dynamic_mode > 0 ?
+ preempt_dynamic_mode >= 0 ?
preempt_modes[preempt_dynamic_mode] : "undef",
brace ? "}" : "");
return seq_buf_str(&s);
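The one-character relational fix above matters because, assuming the usual layout of the preempt_dynamic_* enum (preempt_dynamic_undefined being the only negative value and preempt_dynamic_none equal to 0), the old "> 0" test reported "undef" when the "none" model was selected. A toy illustration of that off-by-one, using hypothetical names:

#include <stdio.h>

enum { pd_undefined = -1, pd_none, pd_voluntary, pd_full, pd_lazy };

static const char * const modes[] = { "none", "voluntary", "full", "lazy" };

int main(void)
{
	int mode = pd_none;	/* == 0 */

	printf("old test: %s\n", mode > 0  ? modes[mode] : "undef");	/* undef (wrong) */
	printf("new test: %s\n", mode >= 0 ? modes[mode] : "undef");	/* none */
	return 0;
}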
@@ -7815,12 +7999,10 @@ void show_state_filter(unsigned int state_filter)
*/
void __init init_idle(struct task_struct *idle, int cpu)
{
-#ifdef CONFIG_SMP
struct affinity_context ac = (struct affinity_context) {
.new_mask = cpumask_of(cpu),
.flags = 0,
};
-#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -7836,13 +8018,11 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
-#ifdef CONFIG_SMP
/*
* No validation and serialization required at boot time and for
* setting up the idle tasks of not yet online CPUs.
*/
set_cpus_allowed_common(idle, &ac);
-#endif
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the CPU isn't yet set to this CPU so the
@@ -7861,9 +8041,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
rq_set_donor(rq, idle);
rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
-#ifdef CONFIG_SMP
idle->on_cpu = 1;
-#endif
raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
@@ -7876,13 +8054,9 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
-#ifdef CONFIG_SMP
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
}
-#ifdef CONFIG_SMP
-
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
@@ -8118,7 +8292,7 @@ static void balance_hotplug_wait(void)
TASK_UNINTERRUPTIBLE);
}
-#else
+#else /* !CONFIG_HOTPLUG_CPU: */
static inline void balance_push(struct rq *rq)
{
@@ -8132,7 +8306,7 @@ static inline void balance_hotplug_wait(void)
{
}
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* !CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
{
@@ -8441,7 +8615,7 @@ int sched_cpu_dying(unsigned int cpu)
sched_core_cpu_dying(cpu);
return 0;
}
-#endif
+#endif /* CONFIG_HOTPLUG_CPU */
void __init sched_init_smp(void)
{
@@ -8465,6 +8639,8 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
+ sched_init_dl_servers();
+
sched_smp_initialized = true;
}
@@ -8475,13 +8651,6 @@ static int __init migration_init(void)
}
early_initcall(migration_init);
-#else
-void __init sched_init_smp(void)
-{
- sched_init_granularity();
-}
-#endif /* CONFIG_SMP */
-
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -8507,9 +8676,7 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
-#ifdef CONFIG_SMP
BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
-#endif
BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
@@ -8540,7 +8707,7 @@ void __init sched_init(void)
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_EXT_GROUP_SCHED
- root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+ scx_tg_init(&root_task_group);
#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -8552,9 +8719,7 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_SMP
init_defrootdomain();
-#endif
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
@@ -8615,7 +8780,6 @@ void __init sched_init(void)
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
-#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -8641,7 +8805,6 @@ void __init sched_init(void)
#ifdef CONFIG_HOTPLUG_CPU
rcuwait_init(&rq->hotplug_wait);
#endif
-#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
@@ -8689,10 +8852,9 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
-#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+
balance_push_set(smp_processor_id(), false);
-#endif
init_sched_fair_class();
init_sched_ext_class();
@@ -8825,7 +8987,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
}
EXPORT_SYMBOL_GPL(__cant_sleep);
-#ifdef CONFIG_SMP
+# ifdef CONFIG_SMP
void __cant_migrate(const char *file, int line)
{
static unsigned long prev_jiffy;
@@ -8856,8 +9018,8 @@ void __cant_migrate(const char *file, int line)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_migrate);
-#endif
-#endif
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */
#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
@@ -8897,7 +9059,7 @@ void normalize_rt_tasks(void)
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_KGDB_KDB)
+#ifdef CONFIG_KGDB_KDB
/*
* These functions are only useful for KDB.
*
@@ -8921,7 +9083,7 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_KGDB_KDB) */
+#endif /* CONFIG_KGDB_KDB */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
@@ -8980,7 +9142,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
- scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
+ scx_tg_init(tg);
alloc_uclamp_sched_group(tg, parent);
return tg;
@@ -9417,47 +9579,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);
-const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
-/* More than 203 days if BW_SHIFT equals 20. */
-static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
-
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
- u64 burst)
+static int tg_set_cfs_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
{
int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 period, quota, burst;
- if (tg == &root_task_group)
- return -EINVAL;
-
- /*
- * Ensure we have at some amount of bandwidth every period. This is
- * to prevent reaching a state of large arrears when throttled via
- * entity_tick() resulting in prolonged exit starvation.
- */
- if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
- return -EINVAL;
-
- /*
- * Likewise, bound things on the other side by preventing insane quota
- * periods. This also allows us to normalize in computing quota
- * feasibility.
- */
- if (period > max_cfs_quota_period)
- return -EINVAL;
+ period = (u64)period_us * NSEC_PER_USEC;
- /*
- * Bound quota to defend quota against overflow during bandwidth shift.
- */
- if (quota != RUNTIME_INF && quota > max_cfs_runtime)
- return -EINVAL;
+ if (quota_us == RUNTIME_INF)
+ quota = RUNTIME_INF;
+ else
+ quota = (u64)quota_us * NSEC_PER_USEC;
- if (quota != RUNTIME_INF && (burst > quota ||
- burst + quota > max_cfs_runtime))
- return -EINVAL;
+ burst = (u64)burst_us * NSEC_PER_USEC;
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
@@ -9512,28 +9650,22 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
return 0;
}
-static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+static u64 tg_get_cfs_period(struct task_group *tg)
{
- u64 quota, period, burst;
+ u64 cfs_period_us;
- period = ktime_to_ns(tg->cfs_bandwidth.period);
- burst = tg->cfs_bandwidth.burst;
- if (cfs_quota_us < 0)
- quota = RUNTIME_INF;
- else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
- quota = (u64)cfs_quota_us * NSEC_PER_USEC;
- else
- return -EINVAL;
+ cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
+ do_div(cfs_period_us, NSEC_PER_USEC);
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
+ return cfs_period_us;
}
-static long tg_get_cfs_quota(struct task_group *tg)
+static u64 tg_get_cfs_quota(struct task_group *tg)
{
u64 quota_us;
if (tg->cfs_bandwidth.quota == RUNTIME_INF)
- return -1;
+ return RUNTIME_INF;
quota_us = tg->cfs_bandwidth.quota;
do_div(quota_us, NSEC_PER_USEC);
@@ -9541,45 +9673,7 @@ static long tg_get_cfs_quota(struct task_group *tg)
return quota_us;
}
-static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
-{
- u64 quota, period, burst;
-
- if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
- return -EINVAL;
-
- period = (u64)cfs_period_us * NSEC_PER_USEC;
- quota = tg->cfs_bandwidth.quota;
- burst = tg->cfs_bandwidth.burst;
-
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
-}
-
-static long tg_get_cfs_period(struct task_group *tg)
-{
- u64 cfs_period_us;
-
- cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
- do_div(cfs_period_us, NSEC_PER_USEC);
-
- return cfs_period_us;
-}
-
-static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
-{
- u64 quota, period, burst;
-
- if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
- return -EINVAL;
-
- burst = (u64)cfs_burst_us * NSEC_PER_USEC;
- period = ktime_to_ns(tg->cfs_bandwidth.period);
- quota = tg->cfs_bandwidth.quota;
-
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
-}
-
-static long tg_get_cfs_burst(struct task_group *tg)
+static u64 tg_get_cfs_burst(struct task_group *tg)
{
u64 burst_us;
@@ -9589,42 +9683,6 @@ static long tg_get_cfs_burst(struct task_group *tg)
return burst_us;
}
-static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_quota(css_tg(css));
-}
-
-static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
- struct cftype *cftype, s64 cfs_quota_us)
-{
- return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
-}
-
-static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_period(css_tg(css));
-}
-
-static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 cfs_period_us)
-{
- return tg_set_cfs_period(css_tg(css), cfs_period_us);
-}
-
-static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_burst(css_tg(css));
-}
-
-static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 cfs_burst_us)
-{
- return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
-}
-
struct cfs_schedulable_data {
struct task_group *tg;
u64 period, quota;
@@ -9757,6 +9815,126 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
return 0;
}
+
+const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
+static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_bw_runtime_us = MAX_BW;
+
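A quick sanity check of the "more than 203 days" remark, assuming MAX_BW expands to ((1ULL << (64 - BW_SHIFT)) - 1) with BW_SHIFT == 20 as in kernel/sched/sched.h; treat that definition as an assumption of this sketch rather than part of the patch.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned bw_shift = 20;
	const uint64_t max_bw = (1ULL << (64 - bw_shift)) - 1;	/* interpreted as usec */
	double days = (double)max_bw / 1e6 / 3600.0 / 24.0;

	/* prints roughly 203.7 days */
	printf("max_bw_runtime_us = %llu usec ~= %.1f days\n",
	       (unsigned long long)max_bw, days);
	return 0;
}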
+static void tg_bandwidth(struct task_group *tg,
+ u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
+{
+ if (period_us_p)
+ *period_us_p = tg_get_cfs_period(tg);
+ if (quota_us_p)
+ *quota_us_p = tg_get_cfs_quota(tg);
+ if (burst_us_p)
+ *burst_us_p = tg_get_cfs_burst(tg);
+}
+
+static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 period_us;
+
+ tg_bandwidth(css_tg(css), &period_us, NULL, NULL);
+ return period_us;
+}
+
+static int tg_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ /* Values should survive translation to nsec */
+ if (period_us > max_usec ||
+ (quota_us != RUNTIME_INF && quota_us > max_usec) ||
+ burst_us > max_usec)
+ return -EINVAL;
+
+ /*
+ * Ensure we have some amount of bandwidth every period. This is to
+ * prevent reaching a state of large arrears when throttled via
+ * entity_tick() resulting in prolonged exit starvation.
+ */
+ if (quota_us < min_bw_quota_period_us ||
+ period_us < min_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Likewise, bound things on the other side by preventing insane quota
+ * periods. This also allows us to normalize in computing quota
+ * feasibility.
+ */
+ if (period_us > max_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us)
+ return -EINVAL;
+
+ if (quota_us != RUNTIME_INF && (burst_us > quota_us ||
+ burst_us + quota_us > max_bw_runtime_us))
+ return -EINVAL;
+
+ return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 quota_us;
+
+ tg_bandwidth(css_tg(css), NULL, &quota_us, NULL);
+ return quota_us; /* (s64)RUNTIME_INF becomes -1 */
+}
+
+static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 burst_us;
+
+ tg_bandwidth(css_tg(css), NULL, NULL, &burst_us);
+ return burst_us;
+}
+
+static int cpu_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 period_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 quota_us, burst_us;
+
+ tg_bandwidth(tg, NULL, &quota_us, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 quota_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, burst_us;
+
+ if (quota_us < 0)
+ quota_us = RUNTIME_INF;
+
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 burst_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, quota_us;
+
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
#endif /* CONFIG_CFS_BANDWIDTH */
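A hedged userspace sketch of the validation order tg_set_bandwidth() applies to the microsecond-based values before handing them to tg_set_cfs_bandwidth(): the 1 ms / 1 s period limits come from the constants added above, while RUNTIME_INF_US and MAX_RUNTIME_US are stand-ins for the kernel's RUNTIME_INF and MAX_BW (the latter assuming BW_SHIFT == 20).

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define RUNTIME_INF_US	UINT64_MAX		/* stand-in for RUNTIME_INF */
#define MIN_PERIOD_US	1000ULL			/* 1 ms */
#define MAX_PERIOD_US	1000000ULL		/* 1 s  */
#define MAX_RUNTIME_US	((1ULL << 44) - 1)	/* stand-in for MAX_BW, BW_SHIFT == 20 */

static bool bw_params_valid(uint64_t period_us, uint64_t quota_us, uint64_t burst_us)
{
	const uint64_t max_us = UINT64_MAX / 1000;	/* must survive usec -> nsec */

	if (period_us > max_us || burst_us > max_us ||
	    (quota_us != RUNTIME_INF_US && quota_us > max_us))
		return false;
	/* guarantee some bandwidth each period, and a sane period length */
	if (quota_us < MIN_PERIOD_US || period_us < MIN_PERIOD_US)
		return false;
	if (period_us > MAX_PERIOD_US)
		return false;
	/* bound quota and quota+burst against overflow during the bandwidth shift */
	if (quota_us != RUNTIME_INF_US &&
	    (quota_us > MAX_RUNTIME_US || burst_us > quota_us ||
	     burst_us + quota_us > MAX_RUNTIME_US))
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", bw_params_valid(100000, 50000, 0));		/* 1: 50ms/100ms  */
	printf("%d\n", bw_params_valid(2000000, 50000, 0));		/* 0: period > 1s */
	printf("%d\n", bw_params_valid(100000, RUNTIME_INF_US, 0));	/* 1: "max" quota */
	return 0;
}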
#ifdef CONFIG_RT_GROUP_SCHED
@@ -9802,7 +9980,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
scx_group_set_idle(css_tg(css), idle);
return ret;
}
-#endif
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_GROUP_SCHED_WEIGHT
@@ -9819,19 +9997,19 @@ static struct cftype cpu_legacy_files[] = {
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
- .name = "cfs_quota_us",
- .read_s64 = cpu_cfs_quota_read_s64,
- .write_s64 = cpu_cfs_quota_write_s64,
+ .name = "cfs_period_us",
+ .read_u64 = cpu_period_read_u64,
+ .write_u64 = cpu_period_write_u64,
},
{
- .name = "cfs_period_us",
- .read_u64 = cpu_cfs_period_read_u64,
- .write_u64 = cpu_cfs_period_write_u64,
+ .name = "cfs_quota_us",
+ .read_s64 = cpu_quota_read_s64,
+ .write_s64 = cpu_quota_write_s64,
},
{
.name = "cfs_burst_us",
- .read_u64 = cpu_cfs_burst_read_u64,
- .write_u64 = cpu_cfs_burst_write_u64,
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
},
{
.name = "stat",
@@ -9930,7 +10108,7 @@ static int cpu_extra_stat_show(struct seq_file *sf,
cfs_b->nr_periods, cfs_b->nr_throttled,
throttled_usec, cfs_b->nr_burst, burst_usec);
}
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
return 0;
}
@@ -10028,22 +10206,20 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
}
/* caller should put the current value in *@periodp before calling */
-static int __maybe_unused cpu_period_quota_parse(char *buf,
- u64 *periodp, u64 *quotap)
+static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
+ u64 *quota_us_p)
{
char tok[21]; /* U64_MAX */
- if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
+ if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1)
return -EINVAL;
- *periodp *= NSEC_PER_USEC;
-
- if (sscanf(tok, "%llu", quotap))
- *quotap *= NSEC_PER_USEC;
- else if (!strcmp(tok, "max"))
- *quotap = RUNTIME_INF;
- else
- return -EINVAL;
+ if (sscanf(tok, "%llu", quota_us_p) < 1) {
+ if (!strcmp(tok, "max"))
+ *quota_us_p = RUNTIME_INF;
+ else
+ return -EINVAL;
+ }
return 0;
}
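A standalone illustration of the reworked cpu_period_quota_parse() behaviour now that values stay in microseconds: the first cpu.max field is the quota (or "max"), the optional second field is the period, and the caller seeds the period with its current value. RUNTIME_INF is modelled here with ULLONG_MAX, a stand-in chosen only for this sketch.

#include <stdio.h>
#include <string.h>
#include <limits.h>

#define RUNTIME_INF_US	ULLONG_MAX	/* stand-in for RUNTIME_INF */

static int parse_cpu_max(const char *buf, unsigned long long *period_us,
			 unsigned long long *quota_us)
{
	char tok[21];	/* enough for U64_MAX */

	if (sscanf(buf, "%20s %llu", tok, period_us) < 1)
		return -1;

	if (sscanf(tok, "%llu", quota_us) < 1) {
		if (!strcmp(tok, "max"))
			*quota_us = RUNTIME_INF_US;
		else
			return -1;
	}
	return 0;
}

int main(void)
{
	unsigned long long period = 100000, quota = 0;	/* caller seeds the period */

	parse_cpu_max("max 500000", &period, &quota);
	printf("quota=%s period=%llu\n",
	       quota == RUNTIME_INF_US ? "max" : "finite", period);	/* max, 500000 */

	parse_cpu_max("50000", &period, &quota);
	printf("quota=%llu period=%llu\n", quota, period);	/* 50000, 500000 (kept) */
	return 0;
}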
@@ -10052,8 +10228,10 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
static int cpu_max_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
+ u64 period_us, quota_us;
- cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ cpu_period_quota_print(sf, period_us, quota_us);
return 0;
}
@@ -10061,17 +10239,16 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct task_group *tg = css_tg(of_css(of));
- u64 period = tg_get_cfs_period(tg);
- u64 burst = tg->cfs_bandwidth.burst;
- u64 quota;
+ u64 period_us, quota_us, burst_us;
int ret;
- ret = cpu_period_quota_parse(buf, &period, &quota);
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ ret = cpu_period_quota_parse(buf, &period_us, &quota_us);
if (!ret)
- ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
+ ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us);
return ret ?: nbytes;
}
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
static struct cftype cpu_files[] = {
#ifdef CONFIG_GROUP_SCHED_WEIGHT
@@ -10104,10 +10281,10 @@ static struct cftype cpu_files[] = {
{
.name = "max.burst",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = cpu_cfs_burst_read_u64,
- .write_u64 = cpu_cfs_burst_write_u64,
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
},
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
.name = "uclamp.min",
@@ -10121,7 +10298,7 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
-#endif
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
{ } /* terminate */
};
@@ -10142,7 +10319,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.threaded = true,
};
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
void dump_cpu_task(int cpu)
{
@@ -10728,7 +10905,7 @@ void sched_mm_cid_fork(struct task_struct *t)
WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
t->mm_cid_active = 1;
}
-#endif
+#endif /* CONFIG_SCHED_MM_CID */
#ifdef CONFIG_SCHED_CLASS_EXT
void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
@@ -10763,4 +10940,4 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
if (ctx->running)
set_next_task(rq, ctx->p);
}
-#endif /* CONFIG_SCHED_CLASS_EXT */
+#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index c4606ca89210..9ede71ecba7f 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -4,6 +4,8 @@
* A simple wrapper around refcount. An allocated sched_core_cookie's
* address is used to compute the cookie of the task.
*/
+#include "sched.h"
+
struct sched_core_cookie {
refcount_t refcnt;
};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 0de9dda09949..23a56ba12d81 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -6,6 +6,8 @@
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
+#include <linux/sched/cputime.h>
+#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 95baa12a1029..cdd740b3f774 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -6,6 +6,7 @@
*
* Author: Juri Lelli <j.lelli@sssup.it>
*/
+#include "sched.h"
static inline int parent(int i)
{
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 0adeda93b5fb..11c0f1faa7e1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,4 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/spinlock.h>
#define IDX_INVALID -1
@@ -15,7 +17,6 @@ struct cpudl {
struct cpudl_item *elements;
};
-#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu);
@@ -23,4 +24,3 @@ int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
-#endif /* CONFIG_SMP */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 5252fb191fae..742fb9e62e1a 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,6 +5,7 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 816f07f9d30f..0ab5f9d4bc59 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -5,6 +5,8 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include <uapi/linux/sched/types.h>
+#include "sched.h"
#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
@@ -380,9 +382,9 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
sg_cpu->saved_idle_calls = idle_calls;
return ret;
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* Make sugov_should_update_freq() ignore the rate limit when DL
@@ -630,7 +632,7 @@ static const struct kobj_type sugov_tunables_ktype = {
/********************** cpufreq governor interface *********************/
-struct cpufreq_governor schedutil_gov;
+static struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
@@ -909,7 +911,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
WRITE_ONCE(sg_policy->limits_changed, true);
}
-struct cpufreq_governor schedutil_gov = {
+static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
.flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
@@ -927,4 +929,9 @@ struct cpufreq_governor *cpufreq_default_governor(void)
}
#endif
+bool sugov_is_governor(struct cpufreq_policy *policy)
+{
+ return policy->governor == &schedutil_gov;
+}
+
cpufreq_governor_init(schedutil_gov);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 42c40cfdf836..76a9ac5eb794 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -22,6 +22,7 @@
* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
+#include "sched.h"
/*
* p->rt_priority p->prio newpri cpupri
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index d6cba0020064..6f562088c056 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/atomic.h>
+#include <linux/cpumask.h>
+#include <linux/sched/rt.h>
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)
@@ -17,7 +20,6 @@ struct cpupri {
int *cpu_to_pri;
};
-#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask);
int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
@@ -26,4 +28,3 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 6dab4854c6c0..7097de2c8cda 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,6 +2,9 @@
/*
* Simple CPU accounting cgroup controller
*/
+#include <linux/sched/cputime.h>
+#include <linux/tsacct_kern.h>
+#include "sched.h"
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/cputime.h>
@@ -88,7 +91,7 @@ static u64 irqtime_tick_accounted(u64 maxtime)
return delta;
}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static u64 irqtime_tick_accounted(u64 dummy)
{
@@ -241,7 +244,7 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
}
-#endif
+#endif /* CONFIG_SCHED_CORE */
/*
* When a guest is interrupted for a longer amount of time, missed clock
@@ -262,7 +265,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
return steal;
}
-#endif
+#endif /* CONFIG_PARAVIRT */
return 0;
}
@@ -288,7 +291,7 @@ static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
return t->se.sum_exec_runtime;
}
-#else
+#else /* !CONFIG_64BIT: */
static u64 read_sum_exec_runtime(struct task_struct *t)
{
u64 ns;
@@ -301,7 +304,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
return ns;
}
-#endif
+#endif /* !CONFIG_64BIT */
/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
@@ -411,11 +414,11 @@ static void irqtime_account_idle_ticks(int ticks)
{
irqtime_account_process_tick(current, 0, ticks);
}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
int nr_ticks) { }
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
/*
* Use precise platform statistics if available:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ad45a8fea245..e2d51f4306b3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,6 +17,10 @@
*/
#include <linux/cpuset.h>
+#include <linux/sched/clock.h>
+#include <uapi/linux/sched/types.h>
+#include "sched.h"
+#include "pelt.h"
/*
* Default limits for DL period; on the top end we guard against small util
@@ -51,7 +55,7 @@ static int __init sched_dl_sysctl_init(void)
return 0;
}
late_initcall(sched_dl_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
static bool dl_server(struct sched_dl_entity *dl_se)
{
@@ -99,7 +103,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return pi_of(dl_se) != dl_se;
}
-#else
+#else /* !CONFIG_RT_MUTEXES: */
static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
{
return dl_se;
@@ -109,9 +113,8 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return false;
}
-#endif
+#endif /* !CONFIG_RT_MUTEXES */
-#ifdef CONFIG_SMP
static inline struct dl_bw *dl_bw_of(int i)
{
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
@@ -191,35 +194,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw)
rq->dl.extra_bw += bw;
}
}
-#else
-static inline struct dl_bw *dl_bw_of(int i)
-{
- return &cpu_rq(i)->dl.dl_bw;
-}
-
-static inline int dl_bw_cpus(int i)
-{
- return 1;
-}
-
-static inline unsigned long dl_bw_capacity(int i)
-{
- return SCHED_CAPACITY_SCALE;
-}
-
-bool dl_bw_visited(int cpu, u64 cookie)
-{
- return false;
-}
-
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
-
- dl->extra_bw += bw;
-}
-#endif
static inline
void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
@@ -552,23 +526,17 @@ void init_dl_rq(struct dl_rq *dl_rq)
{
dl_rq->root = RB_ROOT_CACHED;
-#ifdef CONFIG_SMP
/* zero means no -deadline tasks */
dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
dl_rq->overloaded = 0;
dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
-#else
- init_dl_bw(&dl_rq->dl_bw);
-#endif
dl_rq->running_bw = 0;
dl_rq->this_bw = 0;
init_dl_rq_bw_ratio(dl_rq);
}
-#ifdef CONFIG_SMP
-
static inline int dl_overloaded(struct rq *rq)
{
return atomic_read(&rq->rd->dlo_count);
@@ -753,37 +721,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
return later_rq;
}
-#else
-
-static inline
-void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline
-void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline
-void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-}
-
-static inline
-void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-}
-
-static inline void deadline_queue_push_tasks(struct rq *rq)
-{
-}
-
-static inline void deadline_queue_pull_task(struct rq *rq)
-{
-}
-#endif /* CONFIG_SMP */
-
static void
enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -824,6 +761,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
+ update_rq_clock(rq);
+
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
@@ -1195,7 +1134,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
{
-#ifdef CONFIG_SMP
/*
* Queueing this task back might have overloaded rq, check if we need
* to kick someone away.
@@ -1209,12 +1147,13 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
push_dl_task(rq);
rq_repin_lock(rq, rf);
}
-#endif
}
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
+static bool dl_server_stopped(struct sched_dl_entity *dl_se);
+
static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
{
struct rq *rq = rq_of_dl_se(dl_se);
@@ -1234,6 +1173,7 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
if (!dl_se->server_has_tasks(dl_se)) {
replenish_dl_entity(dl_se);
+ dl_server_stopped(dl_se);
return HRTIMER_NORESTART;
}
@@ -1339,7 +1279,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
goto unlock;
}
-#ifdef CONFIG_SMP
if (unlikely(!rq->online)) {
/*
* If the runqueue is no longer available, migrate the
@@ -1356,7 +1295,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* there.
*/
}
-#endif
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->donor))
@@ -1504,7 +1442,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
if (dl_entity_is_special(dl_se))
return;
- scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
+ scaled_delta_exec = delta_exec;
+ if (!dl_server(dl_se))
+ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
dl_se->runtime -= scaled_delta_exec;
@@ -1598,7 +1538,7 @@ throttle:
rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
/*
@@ -1611,7 +1551,7 @@ throttle:
*/
void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
{
- s64 delta_exec, scaled_delta_exec;
+ s64 delta_exec;
if (!rq->fair_server.dl_defer)
return;
@@ -1624,9 +1564,7 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
if (delta_exec < 0)
return;
- scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec);
-
- rq->fair_server.runtime -= scaled_delta_exec;
+ rq->fair_server.runtime -= delta_exec;
if (rq->fair_server.runtime < 0) {
rq->fair_server.dl_defer_running = 0;
@@ -1639,31 +1577,17 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
/* 0 runtime = fair server disabled */
- if (dl_se->dl_runtime)
+ if (dl_se->dl_runtime) {
+ dl_se->dl_server_idle = 0;
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+ }
}
void dl_server_start(struct sched_dl_entity *dl_se)
{
struct rq *rq = dl_se->rq;
- /*
- * XXX: the apply do not work fine at the init phase for the
- * fair server because things are not yet set. We need to improve
- * this before getting generic.
- */
- if (!dl_server(dl_se)) {
- u64 runtime = 50 * NSEC_PER_MSEC;
- u64 period = 1000 * NSEC_PER_MSEC;
-
- dl_server_apply_params(dl_se, runtime, period, 1);
-
- dl_se->dl_server = 1;
- dl_se->dl_defer = 1;
- setup_new_dl_entity(dl_se);
- }
-
- if (!dl_se->dl_runtime)
+ if (!dl_server(dl_se) || dl_se->dl_server_active)
return;
dl_se->dl_server_active = 1;
@@ -1674,7 +1598,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
void dl_server_stop(struct sched_dl_entity *dl_se)
{
- if (!dl_se->dl_runtime)
+ if (!dl_server(dl_se) || !dl_server_active(dl_se))
return;
dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
@@ -1684,6 +1608,20 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
dl_se->dl_server_active = 0;
}
+static bool dl_server_stopped(struct sched_dl_entity *dl_se)
+{
+ if (!dl_se->dl_server_active)
+ return false;
+
+ if (dl_se->dl_server_idle) {
+ dl_server_stop(dl_se);
+ return true;
+ }
+
+ dl_se->dl_server_idle = 1;
+ return false;
+}
+
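The idle handshake added here works across two timer periods: dl_server_update() clears dl_server_idle whenever runtime is actually consumed, while dl_server_stopped() sets it and only stops the server when it finds the flag still set from the previous check. A minimal stand-alone model of that two-flag handshake (the struct below is a stand-in, not sched_dl_entity):

#include <stdio.h>
#include <stdbool.h>

struct dl_server_model {
	bool active;
	bool idle;	/* set by the timer, cleared when runtime is consumed */
};

static void model_server_update(struct dl_server_model *s)
{
	s->idle = false;			/* mirrors dl_server_update() */
}

static bool model_server_stopped(struct dl_server_model *s)
{
	if (!s->active)
		return false;
	if (s->idle) {				/* idle for a whole period: stop */
		s->active = false;
		return true;
	}
	s->idle = true;				/* arm the check for the next period */
	return false;
}

int main(void)
{
	struct dl_server_model s = { .active = true, .idle = false };

	printf("%d\n", model_server_stopped(&s));	/* 0: arms the flag */
	model_server_update(&s);			/* work arrived, flag cleared */
	printf("%d\n", model_server_stopped(&s));	/* 0: arms again */
	printf("%d\n", model_server_stopped(&s));	/* 1: idle for a full period */
	return 0;
}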
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task)
@@ -1693,6 +1631,32 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_se->server_pick_task = pick_task;
}
+void sched_init_dl_servers(void)
+{
+ int cpu;
+ struct rq *rq;
+ struct sched_dl_entity *dl_se;
+
+ for_each_online_cpu(cpu) {
+ u64 runtime = 50 * NSEC_PER_MSEC;
+ u64 period = 1000 * NSEC_PER_MSEC;
+
+ rq = cpu_rq(cpu);
+
+ guard(rq_lock_irq)(rq);
+
+ dl_se = &rq->fair_server;
+
+ WARN_ON(dl_server(dl_se));
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
+ dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
+ setup_new_dl_entity(dl_se);
+ }
+}
+
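For reference, the defaults applied above give each CPU's fair server runtime = 50 ms out of a period = 1000 ms, i.e. a 5% bandwidth reservation for fair tasks. A trivial check of that arithmetic (NSEC_PER_MSEC assumed to be 10^6):

#include <stdio.h>

int main(void)
{
	const unsigned long long nsec_per_msec = 1000000ULL;
	unsigned long long runtime = 50 * nsec_per_msec;
	unsigned long long period = 1000 * nsec_per_msec;

	/* prints: fair server bandwidth: 5.0% */
	printf("fair server bandwidth: %.1f%%\n", 100.0 * runtime / period);
	return 0;
}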
void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
{
u64 new_bw = dl_se->dl_bw;
@@ -1844,8 +1808,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
#define __node_2_dle(node) \
rb_entry((node), struct sched_dl_entity, rb_node)
-#ifdef CONFIG_SMP
-
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
struct rq *rq = rq_of_dl_rq(dl_rq);
@@ -1881,13 +1843,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
}
}
-#else
-
-static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-
-#endif /* CONFIG_SMP */
-
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
@@ -2166,6 +2121,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
if (dl_server(&p->dl))
return;
+ if (task_is_blocked(p))
+ return;
+
if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -2214,8 +2172,6 @@ static void yield_task_dl(struct rq *rq)
rq_clock_skip_update(rq);
}
-#ifdef CONFIG_SMP
-
static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
struct rq *rq)
{
@@ -2345,7 +2301,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
return sched_stop_runnable(rq) || sched_dl_runnable(rq);
}
-#endif /* CONFIG_SMP */
/*
* Only called when both the current and waking task are -deadline
@@ -2359,7 +2314,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
return;
}
-#ifdef CONFIG_SMP
/*
* In the unlikely case current and p have the same deadline
* let us try to decide what's the best thing to do...
@@ -2367,7 +2321,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
if ((p->dl.deadline == rq->donor->dl.deadline) &&
!test_tsk_need_resched(rq->curr))
check_preempt_equal_dl(rq, p);
-#endif /* CONFIG_SMP */
}
#ifdef CONFIG_SCHED_HRTICK
@@ -2375,11 +2328,11 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
hrtick_start(rq, dl_se->runtime);
}
-#else /* !CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
}
-#endif
+#endif /* !CONFIG_SCHED_HRTICK */
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
@@ -2435,7 +2388,7 @@ again:
if (dl_server(dl_se)) {
p = dl_se->server_pick_task(dl_se);
if (!p) {
- if (dl_server_active(dl_se)) {
+ if (!dl_server_stopped(dl_se)) {
dl_se->dl_yielded = 1;
update_curr_dl_se(rq, dl_se, 0);
}
@@ -2465,6 +2418,10 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
+
+ if (task_is_blocked(p))
+ return;
+
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -2500,8 +2457,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-#ifdef CONFIG_SMP
-
/* Only try algorithms three times */
#define DL_MAX_TRIES 3
@@ -2976,7 +2931,14 @@ void dl_clear_root_domain(struct root_domain *rd)
int i;
guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
+
+ /*
+ * Reset total_bw to zero and extra_bw to max_bw so that the next
+ * loop will add the dl-server contributions back properly.
+ */
rd->dl_bw.total_bw = 0;
+ for_each_cpu(i, rd->span)
+ cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw;
/*
* dl_servers are not tasks. Since dl_add_task_root_domain ignores
@@ -2995,8 +2957,6 @@ void dl_clear_root_domain_cpu(int cpu)
dl_clear_root_domain(cpu_rq(cpu)->rd);
}
-#endif /* CONFIG_SMP */
-
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
/*
@@ -3069,10 +3029,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
}
if (rq->donor != p) {
-#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
deadline_queue_push_tasks(rq);
-#endif
if (dl_task(rq->donor))
wakeup_preempt_dl(rq, p, 0);
else
@@ -3092,7 +3050,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
if (!task_on_rq_queued(p))
return;
-#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
* we don't have the old deadline value, and
@@ -3121,13 +3078,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
resched_curr(rq);
}
-#else
- /*
- * We don't know if p has a earlier or later deadline, so let's blindly
- * set a (maybe not needed) rescheduling point.
- */
- resched_curr(rq);
-#endif
}
#ifdef CONFIG_SCHED_CORE
@@ -3149,7 +3099,6 @@ DEFINE_SCHED_CLASS(dl) = {
.put_prev_task = put_prev_task_dl,
.set_next_task = set_next_task_dl,
-#ifdef CONFIG_SMP
.balance = balance_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
@@ -3158,7 +3107,6 @@ DEFINE_SCHED_CLASS(dl) = {
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
.find_lock_rq = find_lock_later_rq,
-#endif
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
@@ -3242,6 +3190,9 @@ void sched_dl_do_global(void)
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+ for_each_possible_cpu(cpu)
+ init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
+
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
@@ -3257,7 +3208,6 @@ void sched_dl_do_global(void)
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
- init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
}
}
@@ -3458,7 +3408,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
return false;
}
-#ifdef CONFIG_SMP
int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
@@ -3570,7 +3519,6 @@ void dl_bw_free(int cpu, u64 dl_bw)
{
dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
}
-#endif
void print_dl_stats(struct seq_file *m, int cpu)
{
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 557246880a7e..3f06ab84d53f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,6 +6,9 @@
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*/
+#include <linux/debugfs.h>
+#include <linux/nmi.h>
+#include "sched.h"
/*
* This allows printing both to /sys/kernel/debug/sched/debug and
@@ -90,10 +93,10 @@ static void sched_feat_enable(int i)
{
static_key_enable_cpuslocked(&sched_feat_keys[i]);
}
-#else
+#else /* !CONFIG_JUMP_LABEL: */
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
-#endif /* CONFIG_JUMP_LABEL */
+#endif /* !CONFIG_JUMP_LABEL */
static int sched_feat_set(char *cmp)
{
@@ -166,8 +169,6 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
-#ifdef CONFIG_SMP
-
static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -214,8 +215,6 @@ static const struct file_operations sched_scaling_fops = {
.release = single_release,
};
-#endif /* SMP */
-
#ifdef CONFIG_PREEMPT_DYNAMIC
static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
@@ -283,7 +282,6 @@ static const struct file_operations sched_dynamic_fops = {
__read_mostly bool sched_debug_verbose;
-#ifdef CONFIG_SMP
static struct dentry *sd_dentry;
@@ -311,9 +309,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
return result;
}
-#else
-#define sched_verbose_write debugfs_write_file_bool
-#endif
static const struct file_operations sched_verbose_fops = {
.read = debugfs_read_file_bool,
@@ -512,7 +507,6 @@ static __init int sched_init_debug(void)
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
-#ifdef CONFIG_SMP
debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
@@ -520,7 +514,6 @@ static __init int sched_init_debug(void)
sched_domains_mutex_lock();
update_sched_domain_debugfs();
sched_domains_mutex_unlock();
-#endif
#ifdef CONFIG_NUMA_BALANCING
numa = debugfs_create_dir("numa_balancing", debugfs_sched);
@@ -530,7 +523,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
@@ -540,8 +533,6 @@ static __init int sched_init_debug(void)
}
late_initcall(sched_init_debug);
-#ifdef CONFIG_SMP
-
static cpumask_var_t sd_sysctl_cpus;
static int sd_flags_show(struct seq_file *m, void *v)
@@ -652,8 +643,6 @@ void dirty_sched_domain_sysctl(int cpu)
__cpumask_set_cpu(cpu, sd_sysctl_cpus);
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
@@ -690,18 +679,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
}
P(se->load.weight);
-#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
P(se->avg.runnable_avg);
-#endif
#undef PN_SCHEDSTAT
#undef PN
#undef P_SCHEDSTAT
#undef P
}
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
static DEFINE_SPINLOCK(sched_debug_lock);
@@ -854,7 +841,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
@@ -874,8 +860,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic_long_read(&cfs_rq->tg->load_avg));
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_CFS_BANDWIDTH
SEQ_printf(m, " .%-30s: %d\n", "throttled",
cfs_rq->throttled);
@@ -929,11 +914,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
PU(dl_nr_running);
-#ifdef CONFIG_SMP
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
-#else
- dl_bw = &dl_rq->dl_bw;
-#endif
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
@@ -951,9 +932,9 @@ static void print_cpu(struct seq_file *m, int cpu)
SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
-#else
+#else /* !CONFIG_X86: */
SEQ_printf(m, "cpu#%d\n", cpu);
-#endif
+#endif /* !CONFIG_X86 */
#define P(x) \
do { \
@@ -976,12 +957,10 @@ do { \
#undef P
#undef PN
-#ifdef CONFIG_SMP
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
P64(avg_idle);
P64(max_idle_balance_cost);
#undef P64
-#endif
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
if (schedstat_enabled()) {
@@ -1163,7 +1142,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
show_numa_stats(p, m);
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
}
void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
@@ -1247,7 +1226,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PS("nr_involuntary_switches", p->nivcsw);
P(se.load.weight);
-#ifdef CONFIG_SMP
P(se.avg.load_sum);
P(se.avg.runnable_sum);
P(se.avg.util_sum);
@@ -1256,13 +1234,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.util_avg);
P(se.avg.last_update_time);
PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
-#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
__PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
-#endif
+#endif /* CONFIG_UCLAMP_TASK */
P(policy);
P(prio);
if (task_has_dl_policy(p)) {
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f5133249fd4d..7dd5cbcb7a06 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -26,7 +26,7 @@ enum scx_consts {
* Iterating all tasks may take a while. Periodically drop
* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
*/
- SCX_OPS_TASK_ITER_BATCH = 32,
+ SCX_TASK_ITER_BATCH = 32,
};
enum scx_exit_kind {
@@ -44,9 +44,9 @@ enum scx_exit_kind {
};
/*
- * An exit code can be specified when exiting with scx_bpf_exit() or
- * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN
- * respectively. The codes are 64bit of the format:
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64-bit values of the format:
*
* Bits: [63 .. 48 47 .. 32 31 .. 0]
* [ SYS ACT ] [ SYS RSN ] [ USR ]
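As a rough illustration of the 64-bit layout documented above, such a code could be packed and unpacked as sketched below; the helper names are hypothetical and simply mirror the documented bit positions:

#include <stdint.h>

/* Hypothetical helpers for the exit-code layout: bits 63..48 system action,
 * bits 47..32 system reason, bits 31..0 user-defined value. */
#define EXAMPLE_ECODE_SYS_ACT(code)	(((uint64_t)(code) >> 48) & 0xffff)
#define EXAMPLE_ECODE_SYS_RSN(code)	(((uint64_t)(code) >> 32) & 0xffff)
#define EXAMPLE_ECODE_USR(code)		((uint64_t)(code) & 0xffffffffull)

static inline uint64_t example_ecode_pack(uint16_t sys_act, uint16_t sys_rsn,
					  uint32_t usr)
{
	return ((uint64_t)sys_act << 48) | ((uint64_t)sys_rsn << 32) | (uint64_t)usr;
}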
@@ -163,16 +163,21 @@ enum scx_ops_flags {
/*
* CPU cgroup support flags
*/
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
+ SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
- SCX_OPS_ENQ_LAST |
- SCX_OPS_ENQ_EXITING |
- SCX_OPS_ENQ_MIGRATION_DISABLED |
- SCX_OPS_ALLOW_QUEUED_WAKEUP |
- SCX_OPS_SWITCH_PARTIAL |
- SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
};
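The internal/external split above implies that a loader should reject any flag outside SCX_OPS_ALL_FLAGS, which automatically covers the internal high bits; a minimal sketch of that check (the surrounding error handling is illustrative only):

	if (ops->flags & ~SCX_OPS_ALL_FLAGS)
		return -EINVAL;	/* unknown or internal-only flag bits set */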
/* argument container for ops.init_task() */
@@ -368,6 +373,15 @@ struct sched_ext_ops {
* @running: A task is starting to run on its associated CPU
* @p: task starting to run
*
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+	 * property is changed (e.g., affinity), since set_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
* See ->runnable() for explanation on the task state notifiers.
*/
void (*running)(struct task_struct *p);
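A minimal BPF-side sketch of the note above, assuming the usual scx example headers (BPF_STRUCT_OPS, bpf_printk); example_running is a hypothetical callback, while scx_bpf_task_cpu() is the kfunc the comment refers to:

void BPF_STRUCT_OPS(example_running, struct task_struct *p)
{
	s32 cpu = scx_bpf_task_cpu(p);	/* CPU @p is going to run on */

	/* key any per-CPU bookkeeping by @cpu, not by the CPU executing this */
	bpf_printk("pid %d starts running on cpu %d", p->pid, cpu);
}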
@@ -377,6 +391,15 @@ struct sched_ext_ops {
* @p: task stopping to run
* @runnable: is task @p still runnable?
*
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+	 * property is changed (e.g., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
* See ->runnable() for explanation on the task state notifiers. If
* !@runnable, ->quiescent() will be invoked after this operation
* returns.
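The same caveat can be sketched for ops.stopping(); again the callback name is hypothetical:

void BPF_STRUCT_OPS(example_stopping, struct task_struct *p, bool runnable)
{
	s32 cpu = scx_bpf_task_cpu(p);	/* CPU @p was running on */

	if (!runnable)
		bpf_printk("pid %d stopped and went to sleep on cpu %d", p->pid, cpu);
}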
@@ -465,6 +488,7 @@ struct sched_ext_ops {
* idle CPU tracking and the following helpers become unavailable:
*
* - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
* - scx_bpf_test_and_clear_cpu_idle()
* - scx_bpf_pick_idle_cpu()
*
@@ -728,6 +752,9 @@ struct sched_ext_ops {
* BPF scheduler is enabled.
*/
char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
};
enum scx_opi {
@@ -739,6 +766,98 @@ enum scx_opi {
SCX_OPI_END = SCX_OP_IDX(init),
};
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats __percpu *event_stats_cpu;
+
+ bool warned_zero_slice;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
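Given the per-CPU event_stats_cpu member above, a system-wide view could be folded together roughly as follows; example_sum_events() is hypothetical and only spells out two of the counters:

static void example_sum_events(struct scx_sched *sch, struct scx_event_stats *out)
{
	int cpu;

	memset(out, 0, sizeof(*out));
	for_each_possible_cpu(cpu) {
		struct scx_event_stats *e = per_cpu_ptr(sch->event_stats_cpu, cpu);

		out->SCX_EV_SELECT_CPU_FALLBACK += e->SCX_EV_SELECT_CPU_FALLBACK;
		out->SCX_EV_BYPASS_DISPATCH += e->SCX_EV_BYPASS_DISPATCH;
		/* ...and so on for the remaining counters... */
	}
}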
+
enum scx_wake_flags {
/* expose select WF_* flags as enums */
SCX_WAKE_FORK = WF_FORK,
@@ -838,18 +957,18 @@ enum scx_tg_flags {
SCX_TG_INITED = 1U << 1,
};
-enum scx_ops_enable_state {
- SCX_OPS_ENABLING,
- SCX_OPS_ENABLED,
- SCX_OPS_DISABLING,
- SCX_OPS_DISABLED,
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
};
-static const char *scx_ops_enable_state_str[] = {
- [SCX_OPS_ENABLING] = "enabling",
- [SCX_OPS_ENABLED] = "enabled",
- [SCX_OPS_DISABLING] = "disabling",
- [SCX_OPS_DISABLED] = "disabled",
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
};
/*
@@ -898,6 +1017,16 @@ enum scx_ops_state {
#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
/*
+ * NOTE: sched_ext is in the process of growing multiple scheduler support and
+ * scx_root usage is in a transitional state. Naked dereferences are safe if the
+ * caller is one of the tasks attached to SCX and explicit RCU dereference is
+ * necessary otherwise. Naked scx_root dereferences trigger sparse warnings but
+ * are used as temporary markers to indicate that the dereferences need to be
+ * updated to point to the associated scheduler instances rather than scx_root.
+ */
+static struct scx_sched __rcu *scx_root;
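For callers that are not attached tasks, the access pattern the note describes looks roughly like the sketch below (example_peek_root() is hypothetical; scx_kf_exit() later in this patch follows the same shape):

static void example_peek_root(void)
{
	struct scx_sched *sch;

	rcu_read_lock();
	sch = rcu_dereference(scx_root);
	if (sch)
		pr_info("sched_ext: active scheduler is %s\n", sch->ops.name);
	rcu_read_unlock();
}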
+
+/*
* During exit, a task may schedule after losing its PIDs. When disabling the
* BPF scheduler, we need to be able to iterate tasks in every state to
* guarantee system safety. Maintain a dedicated task list which contains every
@@ -907,33 +1036,17 @@ static DEFINE_SPINLOCK(scx_tasks_lock);
static LIST_HEAD(scx_tasks);
/* ops enable/disable */
-static struct kthread_worker *scx_ops_helper;
-static DEFINE_MUTEX(scx_ops_enable_mutex);
-DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
+static DEFINE_MUTEX(scx_enable_mutex);
+DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
-static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
static unsigned long scx_in_softlockup;
-static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
-static int scx_ops_bypass_depth;
-static bool scx_ops_init_task_enabled;
+static atomic_t scx_breather_depth = ATOMIC_INIT(0);
+static int scx_bypass_depth;
+static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
-static struct sched_ext_ops scx_ops;
-static bool scx_warned_zero_slice;
-
-DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
-static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
-
-static struct static_key_false scx_has_op[SCX_OPI_END] =
- { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
-
-static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
-static struct scx_exit_info *scx_exit_info;
-
static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
@@ -947,7 +1060,7 @@ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
* The maximum amount of time in jiffies that a task may be runnable without
* being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_ops_error().
+ * scx_error().
*/
static unsigned long scx_watchdog_timeout;
@@ -973,23 +1086,12 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
*/
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
-/*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
- * to avoid live-locking in bypass mode where all tasks are dispatched to
- * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
- * sufficient, it can be further split.
- */
-static struct scx_dispatch_q **global_dsqs;
-
static const struct rhashtable_params dsq_hash_params = {
.key_len = sizeof_field(struct scx_dispatch_q, id),
.key_offset = offsetof(struct scx_dispatch_q, id),
.head_offset = offsetof(struct scx_dispatch_q, hash_node),
};
-static struct rhashtable dsq_hash;
static LLIST_HEAD(dsqs_to_free);
/* dispatch buf */
@@ -1036,27 +1138,46 @@ static struct scx_dump_data scx_dump_data = {
/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;
-static struct kobject *scx_root_kobj;
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...);
+static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+ s64 exit_code, const char *fmt, va_list args);
-#define scx_ops_error_kind(err, fmt, args...) \
- scx_ops_exit_kind((err), 0, fmt, ##args)
+static __printf(4, 5) void scx_exit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
+{
+ va_list args;
-#define scx_ops_exit(code, fmt, args...) \
- scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
+ va_start(args, fmt);
+ scx_vexit(sch, kind, exit_code, fmt, args);
+ va_end(args);
+}
+
+static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, ...)
+{
+ struct scx_sched *sch;
+ va_list args;
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ va_start(args, fmt);
+ scx_vexit(sch, kind, exit_code, fmt, args);
+ va_end(args);
+ }
+ rcu_read_unlock();
+}
-#define scx_ops_error(fmt, args...) \
- scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
+#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args)
-#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
+#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
static long jiffies_delta_msecs(unsigned long at, unsigned long now)
{
@@ -1086,12 +1207,14 @@ static bool u32_before(u32 a, u32 b)
static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
{
- return global_dsqs[cpu_to_node(task_cpu(p))];
+ struct scx_sched *sch = scx_root;
+
+ return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
-static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
+static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
- return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
+ return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}
/*
@@ -1147,32 +1270,36 @@ static inline struct rq *scx_locked_rq(void)
return __this_cpu_read(locked_rq);
}
-#define SCX_CALL_OP(mask, op, rq, args...) \
+#define SCX_CALL_OP(sch, mask, op, rq, args...) \
do { \
- update_locked_rq(rq); \
+ if (rq) \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- scx_ops.op(args); \
+ (sch)->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- scx_ops.op(args); \
+ (sch)->ops.op(args); \
} \
- update_locked_rq(NULL); \
+ if (rq) \
+ update_locked_rq(NULL); \
} while (0)
-#define SCX_CALL_OP_RET(mask, op, rq, args...) \
+#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \
({ \
- __typeof__(scx_ops.op(args)) __ret; \
+ __typeof__((sch)->ops.op(args)) __ret; \
\
- update_locked_rq(rq); \
+ if (rq) \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
- __ret = scx_ops.op(args); \
+ __ret = (sch)->ops.op(args); \
scx_kf_disallow(mask); \
} else { \
- __ret = scx_ops.op(args); \
+ __ret = (sch)->ops.op(args); \
} \
- update_locked_rq(NULL); \
+ if (rq) \
+ update_locked_rq(NULL); \
__ret; \
})
@@ -1187,31 +1314,31 @@ do { \
* scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
* the specific task.
*/
-#define SCX_CALL_OP_TASK(mask, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \
do { \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- SCX_CALL_OP(mask, op, rq, task, ##args); \
+ SCX_CALL_OP((sch), mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
} while (0)
-#define SCX_CALL_OP_TASK_RET(mask, op, rq, task, args...) \
+#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \
({ \
- __typeof__(scx_ops.op(task, ##args)) __ret; \
+ __typeof__((sch)->ops.op(task, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task; \
- __ret = SCX_CALL_OP_RET(mask, op, rq, task, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \
current->scx.kf_tasks[0] = NULL; \
__ret; \
})
-#define SCX_CALL_OP_2TASKS_RET(mask, op, rq, task0, task1, args...) \
+#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \
({ \
- __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \
+ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \
BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \
current->scx.kf_tasks[0] = task0; \
current->scx.kf_tasks[1] = task1; \
- __ret = SCX_CALL_OP_RET(mask, op, rq, task0, task1, ##args); \
+ __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \
current->scx.kf_tasks[0] = NULL; \
current->scx.kf_tasks[1] = NULL; \
__ret; \
@@ -1221,8 +1348,8 @@ do { \
static __always_inline bool scx_kf_allowed(u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1235,13 +1362,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_ops_error("cpu_release kfunc called from a nested operation");
+ scx_kf_error("cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_ops_error("dispatch kfunc called from a nested operation");
+ scx_kf_error("dispatch kfunc called from a nested operation");
return false;
}
@@ -1257,18 +1384,13 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_ops_error("called on a task not being operated on");
+ scx_kf_error("called on a task not being operated on");
return false;
}
return true;
}
-static bool scx_kf_allowed_if_unlocked(void)
-{
- return !current->scx.kf_mask;
-}
-
/**
* nldsq_next_task - Iterate to the next task in a non-local DSQ
* @dsq: user dsq being iterated
@@ -1436,15 +1558,15 @@ static void scx_task_iter_stop(struct scx_task_iter *iter)
* @iter: iterator to walk
*
* Visit the next task. See scx_task_iter_start() for details. Locks are dropped
- * and re-acquired every %SCX_OPS_TASK_ITER_BATCH iterations to avoid causing
- * stalls by holding scx_tasks_lock for too long.
+ * and re-acquired every %SCX_TASK_ITER_BATCH iterations to avoid causing stalls
+ * by holding scx_tasks_lock for too long.
*/
static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
{
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
- if (!(++iter->cnt % SCX_OPS_TASK_ITER_BATCH)) {
+ if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
scx_task_iter_relock(iter);
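A hedged usage sketch of the iterator, relying only on the batching behaviour described above; the loop body is a placeholder:

	struct scx_task_iter iter;
	struct task_struct *p;

	scx_task_iter_start(&iter);
	while ((p = scx_task_iter_next_locked(&iter))) {
		/* p's rq is locked here; locks are cycled every SCX_TASK_ITER_BATCH tasks */
	}
	scx_task_iter_stop(&iter);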
@@ -1515,91 +1637,29 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * The total number of tasks enqueued (or pick_task-ed) with a
- * default time slice (SCX_SLICE_DFL).
- */
- s64 SCX_EV_ENQ_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
-/*
- * The event counter is organized by a per-CPU variable to minimize the
- * accounting overhead without synchronization. A system-wide view on the
- * event counter is constructed when requested by scx_bpf_get_event_stat().
- */
-static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
-
/**
* scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
 * @cnt: the number of times the event occurred
*
* This can be used when preemption is not disabled.
*/
-#define scx_add_event(name, cnt) do { \
- this_cpu_add(event_stats_cpu.name, cnt); \
- trace_sched_ext_event(#name, cnt); \
+#define scx_add_event(sch, name, cnt) do { \
+ this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ trace_sched_ext_event(#name, (cnt)); \
} while(0)
/**
* __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
 * @cnt: the number of times the event occurred
*
* This should be used only when preemption is disabled.
*/
-#define __scx_add_event(name, cnt) do { \
- __this_cpu_add(event_stats_cpu.name, cnt); \
+#define __scx_add_event(sch, name, cnt) do { \
+ __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
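Usage of the two variants, as suggested by their comments; the counter names come from struct scx_event_stats above:

	/* preemption may be enabled here, this_cpu_add() provides the protection */
	scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);

	/* on a path already running with preemption disabled, e.g. under a rq lock */
	__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);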
@@ -1624,25 +1684,25 @@ static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
} while (0)
-static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+static void scx_read_events(struct scx_sched *sch,
+ struct scx_event_stats *events);
-static enum scx_ops_enable_state scx_ops_enable_state(void)
+static enum scx_enable_state scx_enable_state(void)
{
- return atomic_read(&scx_ops_enable_state_var);
+ return atomic_read(&scx_enable_state_var);
}
-static enum scx_ops_enable_state
-scx_ops_set_enable_state(enum scx_ops_enable_state to)
+static enum scx_enable_state scx_set_enable_state(enum scx_enable_state to)
{
- return atomic_xchg(&scx_ops_enable_state_var, to);
+ return atomic_xchg(&scx_enable_state_var, to);
}
-static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
- enum scx_ops_enable_state from)
+static bool scx_tryset_enable_state(enum scx_enable_state to,
+ enum scx_enable_state from)
{
int from_v = from;
- return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
+ return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}
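A hedged sketch of how the cmpxchg helper is typically used on the enable path; the error handling is illustrative only:

	/* move ENABLING -> ENABLED only if nothing (e.g. an early error forcing
	 * DISABLING) changed the state underneath us */
	if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING))
		pr_warn("sched_ext: enable raced with disable\n");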
static bool scx_rq_bypassing(struct rq *rq)
@@ -1667,8 +1727,14 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss)
} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
}
+static inline bool __cpu_valid(s32 cpu)
+{
+ return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu));
+}
+
/**
- * ops_cpu_valid - Verify a cpu number
+ * ops_cpu_valid - Verify a cpu number, to be used on ops input args
+ * @sch: scx_sched to abort on error
* @cpu: cpu number which came from a BPF ops
* @where: extra information reported on error
*
@@ -1676,35 +1742,52 @@ static void wait_ops_state(struct task_struct *p, unsigned long opss)
* Verify that it is in range and one of the possible cpus. If invalid, trigger
* an ops error.
*/
-static bool ops_cpu_valid(s32 cpu, const char *where)
+static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
+{
+ if (__cpu_valid(cpu)) {
+ return true;
+ } else {
+ scx_error(sch, "invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
+ return false;
+ }
+}
+
+/**
+ * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
+ * @cpu: cpu number which came from a BPF ops
+ * @where: extra information reported on error
+ *
+ * The same as ops_cpu_valid() but @sch is implicit.
+ */
+static bool kf_cpu_valid(u32 cpu, const char *where)
{
- if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) {
+ if (__cpu_valid(cpu)) {
return true;
} else {
- scx_ops_error("invalid CPU %d%s%s", cpu,
- where ? " " : "", where ?: "");
+ scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
return false;
}
}
/**
* ops_sanitize_err - Sanitize a -errno value
+ * @sch: scx_sched to error out on error
* @ops_name: operation to blame on failure
* @err: -errno value to sanitize
*
- * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
+ * Verify @err is a valid -errno. If not, trigger scx_error() and return
* -%EPROTO. This is necessary because returning a rogue -errno up the chain can
 * cause misbehaviors. For example, a large negative return from
* ops.init_task() triggers an oops when passed up the call chain because the
* value fails IS_ERR() test after being encoded with ERR_PTR() and then is
* handled as a pointer.
*/
-static int ops_sanitize_err(const char *ops_name, s32 err)
+static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err)
{
if (err < 0 && err >= -MAX_ERRNO)
return err;
- scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
+ scx_error(sch, "ops.%s() returned an invalid errno %d", ops_name, err);
return -EPROTO;
}
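The hazard described in the comment can be made concrete in two lines, assuming <linux/err.h>, where MAX_ERRNO is 4095:

	void *ok  = ERR_PTR(-EINVAL);	/* IS_ERR(ok) is true: -22 lies in [-MAX_ERRNO, -1] */
	void *bad = ERR_PTR(-100000);	/* IS_ERR(bad) is false: the value looks like a
					 * normal pointer and gets dereferenced by callers */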
@@ -1811,7 +1894,7 @@ static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
lockdep_assert_rq_held(rq);
#ifdef CONFIG_SCHED_CORE
- if (SCX_HAS_OP(core_sched_before))
+ if (unlikely(SCX_HAS_OP(scx_root, core_sched_before)))
touch_core_sched(rq, p);
#endif
}
@@ -1849,8 +1932,14 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
WRITE_ONCE(dsq->nr, dsq->nr + delta);
}
-static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
- u64 enq_flags)
+static void refill_task_slice_dfl(struct task_struct *p)
+{
+ p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
+}
+
+static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+ struct task_struct *p, u64 enq_flags)
{
bool is_local = dsq->id == SCX_DSQ_LOCAL;
@@ -1861,7 +1950,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
if (!is_local) {
raw_spin_lock(&dsq->lock);
if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
- scx_ops_error("attempting to dispatch to a destroyed dsq");
+ scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
dsq = find_global_dsq(p);
@@ -1878,7 +1967,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
* disallow any internal DSQ from doing vtime ordering of
* tasks.
*/
- scx_ops_error("cannot use vtime ordering for built-in DSQs");
+ scx_error(sch, "cannot use vtime ordering for built-in DSQs");
enq_flags &= ~SCX_ENQ_DSQ_PRIQ;
}
@@ -1892,8 +1981,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
*/
if (unlikely(RB_EMPTY_ROOT(&dsq->priq) &&
nldsq_next_task(dsq, NULL, false)))
- scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks",
- dsq->id);
+ scx_error(sch, "DSQ ID 0x%016llx already had FIFO-enqueued tasks",
+ dsq->id);
p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ;
rb_add(&p->scx.dsq_priq, &dsq->priq, scx_dsq_priq_less);
@@ -1914,8 +2003,8 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
} else {
/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
if (unlikely(!RB_EMPTY_ROOT(&dsq->priq)))
- scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
- dsq->id);
+ scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
+ dsq->id);
if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
list_add(&p->scx.dsq_list.node, &dsq->list);
@@ -2030,7 +2119,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
raw_spin_unlock(&dsq->lock);
}
-static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
+static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
+ struct rq *rq, u64 dsq_id,
struct task_struct *p)
{
struct scx_dispatch_q *dsq;
@@ -2041,7 +2131,7 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
- if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
+ if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
return find_global_dsq(p);
return &cpu_rq(cpu)->scx.local_dsq;
@@ -2050,11 +2140,11 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
if (dsq_id == SCX_DSQ_GLOBAL)
dsq = find_global_dsq(p);
else
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
- dsq_id, p->comm, p->pid);
+ scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
+ dsq_id, p->comm, p->pid);
return find_global_dsq(p);
}
@@ -2075,12 +2165,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_ops_error("%s[%d] already direct-dispatched",
- p->comm, p->pid);
+ scx_kf_error("%s[%d] already direct-dispatched",
+ p->comm, p->pid);
else
- scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
- ddsp_task->comm, ddsp_task->pid,
- p->comm, p->pid);
+ scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ ddsp_task->comm, ddsp_task->pid,
+ p->comm, p->pid);
return;
}
@@ -2091,11 +2181,12 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
p->scx.ddsp_enq_flags = enq_flags;
}
-static void direct_dispatch(struct task_struct *p, u64 enq_flags)
+static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
struct rq *rq = task_rq(p);
struct scx_dispatch_q *dsq =
- find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
touch_core_sched_dispatch(rq, p);
@@ -2136,7 +2227,8 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)
return;
}
- dispatch_enqueue(dsq, p, p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dsq, p,
+ p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
}
static bool scx_rq_online(struct rq *rq)
@@ -2154,6 +2246,7 @@ static bool scx_rq_online(struct rq *rq)
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
int sticky_cpu)
{
+ struct scx_sched *sch = scx_root;
struct task_struct **ddsp_taskp;
unsigned long qseq;
@@ -2172,7 +2265,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto local;
if (scx_rq_bypassing(rq)) {
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
goto global;
}
@@ -2180,20 +2273,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
goto direct;
/* see %SCX_OPS_ENQ_EXITING */
- if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
+ if (!(sch->ops.flags & SCX_OPS_ENQ_EXITING) &&
unlikely(p->flags & PF_EXITING)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_EXITING, 1);
goto local;
}
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
- if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+ if (!(sch->ops.flags & SCX_OPS_ENQ_MIGRATION_DISABLED) &&
is_migration_disabled(p)) {
- __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
+ __scx_add_event(sch, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
}
- if (!SCX_HAS_OP(enqueue))
+ if (unlikely(!SCX_HAS_OP(sch, enqueue)))
goto global;
/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
@@ -2206,7 +2299,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
+ SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
*ddsp_taskp = NULL;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2220,7 +2313,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
return;
direct:
- direct_dispatch(p, enq_flags);
+ direct_dispatch(sch, p, enq_flags);
return;
local:
@@ -2230,17 +2323,15 @@ local:
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ refill_task_slice_dfl(p);
local_norefill:
- dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
- dispatch_enqueue(find_global_dsq(p), p, enq_flags);
+ refill_task_slice_dfl(p);
+ dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2258,7 +2349,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
}
/*
- * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
+ * list_add_tail() must be used. scx_bypass() depends on tasks being
* appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
@@ -2273,6 +2364,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
{
+ struct scx_sched *sch = scx_root;
int sticky_cpu = p->scx.sticky_cpu;
if (enq_flags & ENQUEUE_WAKEUP)
@@ -2302,8 +2394,8 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
rq->scx.nr_running++;
add_nr_running(rq, 1);
- if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, runnable, rq, p, enq_flags);
+ if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);
@@ -2314,11 +2406,12 @@ out:
if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
unlikely(cpu_of(rq) != p->scx.selected_cpu))
- __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
+ __scx_add_event(sch, SCX_EV_SELECT_CPU_FALLBACK, 1);
}
static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
{
+ struct scx_sched *sch = scx_root;
unsigned long opss;
/* dequeue is always temporary, don't reset runnable_at */
@@ -2337,8 +2430,9 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
*/
BUG();
case SCX_OPSS_QUEUED:
- if (SCX_HAS_OP(dequeue))
- SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, rq, p, deq_flags);
+ if (SCX_HAS_OP(sch, dequeue))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
+ p, deq_flags);
if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
SCX_OPSS_NONE))
@@ -2366,6 +2460,8 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
{
+ struct scx_sched *sch = scx_root;
+
if (!(p->scx.flags & SCX_TASK_QUEUED)) {
WARN_ON_ONCE(task_runnable(p));
return true;
@@ -2385,13 +2481,13 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
* information meaningful to the BPF scheduler and can be suppressed by
* skipping the callbacks if the task is !QUEUED.
*/
- if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
+ if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
update_curr_scx(rq);
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, false);
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
}
- if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
- SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, rq, p, deq_flags);
+ if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
if (deq_flags & SCX_DEQ_SLEEP)
p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2408,20 +2504,23 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
static void yield_task_scx(struct rq *rq)
{
+ struct scx_sched *sch = scx_root;
struct task_struct *p = rq->curr;
- if (SCX_HAS_OP(yield))
- SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, p, NULL);
+ if (SCX_HAS_OP(sch, yield))
+ SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
else
p->scx.slice = 0;
}
static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
{
+ struct scx_sched *sch = scx_root;
struct task_struct *from = rq->curr;
- if (SCX_HAS_OP(yield))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, rq, from, to);
+ if (SCX_HAS_OP(sch, yield))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
+ from, to);
else
return false;
}
@@ -2501,7 +2600,8 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
*
* The caller must ensure that @p and @rq are on different CPUs.
*/
-static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
+static bool task_can_run_on_remote_rq(struct scx_sched *sch,
+ struct task_struct *p, struct rq *rq,
bool enforce)
{
int cpu = cpu_of(rq);
@@ -2522,8 +2622,8 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (unlikely(is_migration_disabled(p))) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
- p->comm, p->pid, task_cpu(p), cpu);
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
return false;
}
@@ -2535,14 +2635,15 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (!task_allowed_on_cpu(p, cpu)) {
if (enforce)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
- cpu, p->comm, p->pid);
+ scx_error(sch, "SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
return false;
}
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(scx_root,
+ SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -2614,12 +2715,13 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
}
#else /* CONFIG_SMP */
static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; }
+static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; }
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
/**
* move_task_between_dsqs() - Move a task from one DSQ to another
+ * @sch: scx_sched being operated on
* @p: target task
* @enq_flags: %SCX_ENQ_*
* @src_dsq: DSQ @p is currently on, must not be a local DSQ
@@ -2633,7 +2735,8 @@ static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p
* On return, @src_dsq is unlocked and only @p's new task_rq, which is the
* return value, is locked.
*/
-static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
+static struct rq *move_task_between_dsqs(struct scx_sched *sch,
+ struct task_struct *p, u64 enq_flags,
struct scx_dispatch_q *src_dsq,
struct scx_dispatch_q *dst_dsq)
{
@@ -2646,7 +2749,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
- unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
@@ -2680,7 +2783,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
p->scx.dsq = NULL;
raw_spin_unlock(&src_dsq->lock);
- dispatch_enqueue(dst_dsq, p, enq_flags);
+ dispatch_enqueue(sch, dst_dsq, p, enq_flags);
}
return dst_rq;
@@ -2692,13 +2795,13 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
* to the bypass mode can take a long time. Inject artificial delays while the
* bypass mode is switching to guarantee timely completion.
*/
-static void scx_ops_breather(struct rq *rq)
+static void scx_breather(struct rq *rq)
{
u64 until;
lockdep_assert_rq_held(rq);
- if (likely(!atomic_read(&scx_ops_breather_depth)))
+ if (likely(!atomic_read(&scx_breather_depth)))
return;
raw_spin_rq_unlock(rq);
@@ -2707,25 +2810,26 @@ static void scx_ops_breather(struct rq *rq)
do {
int cnt = 1024;
- while (atomic_read(&scx_ops_breather_depth) && --cnt)
+ while (atomic_read(&scx_breather_depth) && --cnt)
cpu_relax();
- } while (atomic_read(&scx_ops_breather_depth) &&
+ } while (atomic_read(&scx_breather_depth) &&
time_before64(ktime_get_ns(), until));
raw_spin_rq_lock(rq);
}
-static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq)
+static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
- * This retry loop can repeatedly race against scx_ops_bypass()
- * dequeueing tasks from @dsq trying to put the system into the bypass
- * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can
- * live-lock the machine into soft lockups. Give a breather.
+ * This retry loop can repeatedly race against scx_bypass() dequeueing
+ * tasks from @dsq trying to put the system into the bypass mode. On
+ * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
+ * the machine into soft lockups. Give a breather.
*/
- scx_ops_breather(rq);
+ scx_breather(rq);
/*
* The caller can't expect to successfully consume a task if the task's
@@ -2747,7 +2851,7 @@ retry:
return true;
}
- if (task_can_run_on_remote_rq(p, rq, false)) {
+ if (task_can_run_on_remote_rq(sch, p, rq, false)) {
if (likely(consume_remote_task(rq, p, dsq, task_rq)))
return true;
goto retry;
@@ -2758,15 +2862,16 @@ retry:
return false;
}
-static bool consume_global_dsq(struct rq *rq)
+static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
{
int node = cpu_to_node(cpu_of(rq));
- return consume_dispatch_q(rq, global_dsqs[node]);
+ return consume_dispatch_q(sch, rq, sch->global_dsqs[node]);
}
/**
* dispatch_to_local_dsq - Dispatch a task to a local dsq
+ * @sch: scx_sched being operated on
* @rq: current rq which is locked
* @dst_dsq: destination DSQ
* @p: task to dispatch
@@ -2779,7 +2884,8 @@ static bool consume_global_dsq(struct rq *rq)
* The caller must have exclusive ownership of @p (e.g. through
* %SCX_OPSS_DISPATCHING).
*/
-static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
+static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
+ struct scx_dispatch_q *dst_dsq,
struct task_struct *p, u64 enq_flags)
{
struct rq *src_rq = task_rq(p);
@@ -2795,14 +2901,15 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* If dispatching to @rq that @p is already on, no lock dancing needed.
*/
if (rq == src_rq && rq == dst_rq) {
- dispatch_enqueue(dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dst_dsq, p,
+ enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
#ifdef CONFIG_SMP
if (src_rq != dst_rq &&
- unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
- dispatch_enqueue(find_global_dsq(p), p,
+ unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
+ dispatch_enqueue(sch, find_global_dsq(p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -2840,7 +2947,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
*/
if (src_rq == dst_rq) {
p->scx.holding_cpu = -1;
- dispatch_enqueue(&dst_rq->scx.local_dsq, p, enq_flags);
+ dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
+ enq_flags);
} else {
move_remote_task_to_local_dsq(p, enq_flags,
src_rq, dst_rq);
@@ -2882,7 +2990,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
* was valid in the first place. Make sure that the task is still owned by the
* BPF scheduler and claim the ownership before dispatching.
*/
-static void finish_dispatch(struct rq *rq, struct task_struct *p,
+static void finish_dispatch(struct scx_sched *sch, struct rq *rq,
+ struct task_struct *p,
unsigned long qseq_at_dispatch,
u64 dsq_id, u64 enq_flags)
{
@@ -2935,15 +3044,15 @@ retry:
BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
- dsq = find_dsq_for_dispatch(this_rq(), dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
if (dsq->id == SCX_DSQ_LOCAL)
- dispatch_to_local_dsq(rq, dsq, p, enq_flags);
+ dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
else
- dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
}
-static void flush_dispatch_buf(struct rq *rq)
+static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
u32 u;
@@ -2951,7 +3060,7 @@ static void flush_dispatch_buf(struct rq *rq)
for (u = 0; u < dspc->cursor; u++) {
struct scx_dsp_buf_ent *ent = &dspc->buf[u];
- finish_dispatch(rq, ent->task, ent->qseq, ent->dsq_id,
+ finish_dispatch(sch, rq, ent->task, ent->qseq, ent->dsq_id,
ent->enq_flags);
}
@@ -2961,6 +3070,7 @@ static void flush_dispatch_buf(struct rq *rq)
static int balance_one(struct rq *rq, struct task_struct *prev)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
@@ -2970,7 +3080,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
rq->scx.flags |= SCX_RQ_IN_BALANCE;
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
- if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
+ if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
unlikely(rq->scx.cpu_released)) {
/*
* If the previous sched_class for the current CPU was not SCX,
@@ -2978,8 +3088,9 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* core. This callback complements ->cpu_release(), which is
* emitted in switch_class().
*/
- if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(SCX_KF_REST, cpu_acquire, rq, cpu_of(rq), NULL);
+ if (SCX_HAS_OP(sch, cpu_acquire))
+ SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq,
+ cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -2993,8 +3104,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* scheduler wants to handle this explicitly, it should
* implement ->cpu_release().
*
- * See scx_ops_disable_workfn() for the explanation on the
- * bypassing test.
+ * See scx_disable_workfn() for the explanation on the bypassing
+ * test.
*/
if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -3006,10 +3117,11 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
- if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
+ scx_rq_bypassing(rq) || !scx_rq_online(rq))
goto no_tasks;
dspc->rq = rq;
@@ -3024,10 +3136,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
do {
dspc->nr_tasks = 0;
- SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, rq, cpu_of(rq),
- prev_on_scx ? prev : NULL);
+ SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
+ cpu_of(rq), prev_on_scx ? prev : NULL);
- flush_dispatch_buf(rq);
+ flush_dispatch_buf(sch, rq);
if (prev_on_rq && prev->scx.slice) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
@@ -3035,7 +3147,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
}
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_global_dsq(rq))
+ if (consume_global_dsq(sch, rq))
goto has_tasks;
/*
@@ -3058,10 +3170,10 @@ no_tasks:
* Didn't find another task to run. Keep running @prev unless
* %SCX_OPS_ENQ_LAST is in effect.
*/
- if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
- scx_rq_bypassing(rq))) {
+ if (prev_on_rq &&
+ (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
- __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3121,18 +3233,22 @@ static void process_ddsp_deferred_locals(struct rq *rq)
*/
while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
struct task_struct, scx.dsq_list.node))) {
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *dsq;
list_del_init(&p->scx.dsq_list.node);
- dsq = find_dsq_for_dispatch(rq, p->scx.ddsp_dsq_id, p);
+ dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
- dispatch_to_local_dsq(rq, dsq, p, p->scx.ddsp_enq_flags);
+ dispatch_to_local_dsq(sch, rq, dsq, p,
+ p->scx.ddsp_enq_flags);
}
}
static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
{
+ struct scx_sched *sch = scx_root;
+
if (p->scx.flags & SCX_TASK_QUEUED) {
/*
* Core-sched might decide to execute @p before it is
@@ -3145,8 +3261,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
p->se.exec_start = rq_clock_task(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, running, rq, p);
+ if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);
clr_task_runnable(p, true);
@@ -3189,6 +3305,7 @@ preempt_reason_from_class(const struct sched_class *class)
static void switch_class(struct rq *rq, struct task_struct *next)
{
+ struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
#ifdef CONFIG_SMP
@@ -3199,7 +3316,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
#endif
- if (!static_branch_unlikely(&scx_ops_cpu_preempt))
+ if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
/*
@@ -3221,13 +3338,14 @@ static void switch_class(struct rq *rq, struct task_struct *next)
* next time that balance_scx() is invoked.
*/
if (!rq->scx.cpu_released) {
- if (SCX_HAS_OP(cpu_release)) {
+ if (SCX_HAS_OP(sch, cpu_release)) {
struct scx_cpu_release_args args = {
.reason = preempt_reason_from_class(next_class),
.task = next,
};
- SCX_CALL_OP(SCX_KF_CPU_RELEASE, cpu_release, rq, cpu_of(rq), &args);
+ SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq,
+ cpu_of(rq), &args);
}
rq->scx.cpu_released = true;
}
@@ -3236,11 +3354,12 @@ static void switch_class(struct rq *rq, struct task_struct *next)
static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
struct task_struct *next)
{
+ struct scx_sched *sch = scx_root;
update_curr_scx(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
- if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
- SCX_CALL_OP_TASK(SCX_KF_REST, stopping, rq, p, true);
+ if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);
if (p->scx.flags & SCX_TASK_QUEUED) {
set_task_runnable(rq, p);
@@ -3252,7 +3371,8 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* DSQ.
*/
if (p->scx.slice && !scx_rq_bypassing(rq)) {
- dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p,
+ SCX_ENQ_HEAD);
goto switch_class;
}
@@ -3263,7 +3383,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
* which should trigger an explicit follow-up scheduling event.
*/
if (sched_class_above(&ext_sched_class, next->sched_class)) {
- WARN_ON_ONCE(!static_branch_unlikely(&scx_ops_enq_last));
+ WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST));
do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
} else {
do_enqueue_task(rq, p, 0, -1);
@@ -3316,7 +3436,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
* conditional on scx_enabled() and may have been skipped.
*/
- WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
keep_prev = false;
}
@@ -3327,10 +3447,8 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*/
if (keep_prev) {
p = prev;
- if (!p->scx.slice) {
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
- }
+ if (!p->scx.slice)
+ refill_task_slice_dfl(p);
} else {
p = first_local_task(rq);
if (!p) {
@@ -3340,13 +3458,14 @@ static struct task_struct *pick_task_scx(struct rq *rq)
}
if (unlikely(!p->scx.slice)) {
- if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
+ struct scx_sched *sch = scx_root;
+
+ if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
- scx_warned_zero_slice = true;
+ sch->warned_zero_slice = true;
}
- p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ refill_task_slice_dfl(p);
}
}
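
The hunks above fold the repeated "reset the slice to SCX_SLICE_DFL and count it" pattern into refill_task_slice_dfl(). A minimal userspace sketch of that consolidation, illustrative only and not part of the patch; the struct, the counter name and the 20ms value are stand-ins for the kernel ones:

	#include <stdio.h>

	#define SCX_SLICE_DFL 20000000ULL	/* assumed 20ms default slice, in ns */

	struct task { unsigned long long slice; };

	/* stands in for the SCX_EV_REFILL_SLICE_DFL event counter */
	static unsigned long long nr_refills;

	static void refill_task_slice_dfl(struct task *p)
	{
		p->slice = SCX_SLICE_DFL;
		nr_refills++;		/* every refill site bumps the event exactly once */
	}

	int main(void)
	{
		struct task t = { .slice = 0 };

		if (!t.slice)		/* e.g. pick_task finding an exhausted slice */
			refill_task_slice_dfl(&t);
		printf("slice=%llu refills=%llu\n", t.slice, nr_refills);
		return 0;
	}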
@@ -3375,13 +3494,17 @@ static struct task_struct *pick_task_scx(struct rq *rq)
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi)
{
+ struct scx_sched *sch = scx_root;
+
/*
* The const qualifiers are dropped from task_struct pointers when
* calling ops.core_sched_before(). Accesses are controlled by the
* verifier.
*/
- if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
- return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, NULL,
+ if (SCX_HAS_OP(sch, core_sched_before) &&
+ !scx_rq_bypassing(task_rq(a)))
+ return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before,
+ NULL,
(struct task_struct *)a,
(struct task_struct *)b);
else
@@ -3393,6 +3516,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ struct scx_sched *sch = scx_root;
bool rq_bypass;
/*
@@ -3409,7 +3533,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
return prev_cpu;
rq_bypass = scx_rq_bypassing(task_rq(p));
- if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
+ if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3417,29 +3541,30 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
WARN_ON_ONCE(*ddsp_taskp);
*ddsp_taskp = p;
- cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
- select_cpu, NULL, p, prev_cpu, wake_flags);
+ cpu = SCX_CALL_OP_TASK_RET(sch,
+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
+ select_cpu, NULL, p, prev_cpu,
+ wake_flags);
p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
- if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
+ if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
} else {
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
- p->scx.slice = SCX_SLICE_DFL;
+ refill_task_slice_dfl(p);
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
- __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
} else {
cpu = prev_cpu;
}
p->scx.selected_cpu = cpu;
if (rq_bypass)
- __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
+ __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
}
@@ -3452,6 +3577,8 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_scx(struct task_struct *p,
struct affinity_context *ac)
{
+ struct scx_sched *sch = scx_root;
+
set_cpus_allowed_common(p, ac);
/*
@@ -3462,28 +3589,38 @@ static void set_cpus_allowed_scx(struct task_struct *p,
* Fine-grained memory write control is enforced by BPF making the const
* designation pointless. Cast it away when calling the operation.
*/
- if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, NULL,
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL,
p, (struct cpumask *)p->cpus_ptr);
}
static void handle_hotplug(struct rq *rq, bool online)
{
+ struct scx_sched *sch = scx_root;
int cpu = cpu_of(rq);
atomic_long_inc(&scx_hotplug_seq);
+ /*
+ * scx_root updates are protected by cpus_read_lock() and will stay
+ * stable here. Note that we can't depend on scx_enabled() test as the
+ * hotplug ops need to be enabled before __scx_enabled is set.
+ */
+ if (unlikely(!sch))
+ return;
+
if (scx_enabled())
- scx_idle_update_selcpu_topology(&scx_ops);
+ scx_idle_update_selcpu_topology(&sch->ops);
- if (online && SCX_HAS_OP(cpu_online))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
- else if (!online && SCX_HAS_OP(cpu_offline))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
+ if (online && SCX_HAS_OP(sch, cpu_online))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
+ else if (!online && SCX_HAS_OP(sch, cpu_offline))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
else
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "cpu %d going %s, exiting scheduler", cpu,
- online ? "online" : "offline");
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "cpu %d going %s, exiting scheduler", cpu,
+ online ? "online" : "offline");
}
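
handle_hotplug() now re-reads the RCU-published scx_root and returns early when no scheduler is attached yet. A compilable userspace sketch of that publish-then-check pattern, illustrative only; C11 atomics stand in for rcu_assign_pointer()/rcu_dereference(), and every name below is hypothetical:

	#include <stdatomic.h>
	#include <stdio.h>

	struct sched { const char *name; };

	/* stands in for scx_root; in the kernel this is an RCU-protected pointer */
	static _Atomic(struct sched *) root;

	static void on_hotplug(int cpu, int online)
	{
		struct sched *s = atomic_load_explicit(&root, memory_order_acquire);

		if (!s)		/* no scheduler attached yet: nothing to notify */
			return;
		printf("%s: cpu %d going %s\n", s->name, cpu,
		       online ? "online" : "offline");
	}

	int main(void)
	{
		struct sched s = { .name = "demo" };

		on_hotplug(3, 1);	/* ignored: nothing published yet */
		atomic_store_explicit(&root, &s, memory_order_release);
		on_hotplug(3, 0);	/* now reaches the scheduler */
		return 0;
	}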
void scx_rq_activate(struct rq *rq)
@@ -3510,11 +3647,16 @@ static void rq_offline_scx(struct rq *rq)
static bool check_rq_for_timeouts(struct rq *rq)
{
+ struct scx_sched *sch;
struct task_struct *p;
struct rq_flags rf;
bool timed_out = false;
rq_lock_irqsave(rq, &rf);
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ goto out_unlock;
+
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
unsigned long last_runnable = p->scx.runnable_at;
@@ -3522,16 +3664,15 @@ static bool check_rq_for_timeouts(struct rq *rq)
last_runnable + scx_watchdog_timeout))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "%s[%d] failed to run for %u.%03us",
- p->comm, p->pid,
- dur_ms / 1000, dur_ms % 1000);
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid, dur_ms / 1000, dur_ms % 1000);
timed_out = true;
break;
}
}
+out_unlock:
rq_unlock_irqrestore(rq, &rf);
-
return timed_out;
}
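
The stall message above turns a millisecond duration into seconds with three fractional digits via dur_ms / 1000 and dur_ms % 1000. A self-contained check of that formatting, with a made-up duration:

	#include <stdio.h>

	int main(void)
	{
		unsigned int dur_ms = 30432;	/* e.g. 30.432s since last runnable */

		printf("failed to run for %u.%03us\n", dur_ms / 1000, dur_ms % 1000);
		return 0;			/* prints: failed to run for 30.432s */
	}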
@@ -3553,19 +3694,24 @@ static void scx_watchdog_workfn(struct work_struct *work)
void scx_tick(struct rq *rq)
{
+ struct scx_sched *sch;
unsigned long last_check;
if (!scx_enabled())
return;
+ sch = rcu_dereference_bh(scx_root);
+ if (unlikely(!sch))
+ return;
+
last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies,
last_check + READ_ONCE(scx_watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
- scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
- "watchdog failed to check in for %u.%03us",
- dur_ms / 1000, dur_ms % 1000);
+ scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
}
update_other_load_avgs(rq);
@@ -3573,6 +3719,8 @@ void scx_tick(struct rq *rq)
static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
{
+ struct scx_sched *sch = scx_root;
+
update_curr_scx(rq);
/*
@@ -3582,8 +3730,8 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
if (scx_rq_bypassing(rq)) {
curr->scx.slice = 0;
touch_core_sched(rq, curr);
- } else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP_TASK(SCX_KF_REST, tick, rq, curr);
+ } else if (SCX_HAS_OP(sch, tick)) {
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
}
if (!curr->scx.slice)
@@ -3648,21 +3796,23 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
}
-static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork)
{
+ struct scx_sched *sch = scx_root;
int ret;
p->scx.disallow = false;
- if (SCX_HAS_OP(init_task)) {
+ if (SCX_HAS_OP(sch, init_task)) {
struct scx_init_task_args args = {
SCX_INIT_TASK_ARGS_CGROUP(tg)
.fork = fork,
};
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, NULL, p, &args);
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL,
+ p, &args);
if (unlikely(ret)) {
- ret = ops_sanitize_err("init_task", ret);
+ ret = ops_sanitize_err(sch, "init_task", ret);
return ret;
}
}
@@ -3690,8 +3840,8 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
task_rq_unlock(rq, p, &rf);
} else if (p->policy == SCHED_EXT) {
- scx_ops_error("ops.init_task() set task->scx.disallow for %s[%d] during fork",
- p->comm, p->pid);
+ scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
+ p->comm, p->pid);
}
}
@@ -3699,8 +3849,9 @@ static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
return 0;
}
-static void scx_ops_enable_task(struct task_struct *p)
+static void scx_enable_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
struct rq *rq = task_rq(p);
u32 weight;
@@ -3717,28 +3868,31 @@ static void scx_ops_enable_task(struct task_struct *p)
p->scx.weight = sched_weight_to_cgroup(weight);
- if (SCX_HAS_OP(enable))
- SCX_CALL_OP_TASK(SCX_KF_REST, enable, rq, p);
+ if (SCX_HAS_OP(sch, enable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
scx_set_task_state(p, SCX_TASK_ENABLED);
- if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
}
-static void scx_ops_disable_task(struct task_struct *p)
+static void scx_disable_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
- if (SCX_HAS_OP(disable))
- SCX_CALL_OP_TASK(SCX_KF_REST, disable, rq, p);
+ if (SCX_HAS_OP(sch, disable))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
scx_set_task_state(p, SCX_TASK_READY);
}
-static void scx_ops_exit_task(struct task_struct *p)
+static void scx_exit_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
struct scx_exit_task_args args = {
.cancelled = false,
};
@@ -3754,15 +3908,16 @@ static void scx_ops_exit_task(struct task_struct *p)
case SCX_TASK_READY:
break;
case SCX_TASK_ENABLED:
- scx_ops_disable_task(p);
+ scx_disable_task(p);
break;
default:
WARN_ON_ONCE(true);
return;
}
- if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, task_rq(p), p, &args);
+ if (SCX_HAS_OP(sch, exit_task))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
+ p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -3794,15 +3949,15 @@ int scx_fork(struct task_struct *p)
{
percpu_rwsem_assert_held(&scx_fork_rwsem);
- if (scx_ops_init_task_enabled)
- return scx_ops_init_task(p, task_group(p), true);
+ if (scx_init_task_enabled)
+ return scx_init_task(p, task_group(p), true);
else
return 0;
}
void scx_post_fork(struct task_struct *p)
{
- if (scx_ops_init_task_enabled) {
+ if (scx_init_task_enabled) {
scx_set_task_state(p, SCX_TASK_READY);
/*
@@ -3815,7 +3970,7 @@ void scx_post_fork(struct task_struct *p)
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_enable_task(p);
+ scx_enable_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3835,7 +3990,7 @@ void scx_cancel_fork(struct task_struct *p)
rq = task_rq_lock(p, &rf);
WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
@@ -3851,15 +4006,15 @@ void sched_ext_free(struct task_struct *p)
spin_unlock_irqrestore(&scx_tasks_lock, flags);
/*
- * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
- * ENABLED transitions can't race us. Disable ops for @p.
+ * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
+ * transitions can't race us. Disable ops for @p.
*/
if (scx_get_task_state(p) != SCX_TASK_NONE) {
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
task_rq_unlock(rq, p, &rf);
}
}
@@ -3867,11 +4022,14 @@ void sched_ext_free(struct task_struct *p)
static void reweight_task_scx(struct rq *rq, struct task_struct *p,
const struct load_weight *lw)
{
+ struct scx_sched *sch = scx_root;
+
lockdep_assert_rq_held(task_rq(p));
p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
- if (SCX_HAS_OP(set_weight))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, rq, p, p->scx.weight);
+ if (SCX_HAS_OP(sch, set_weight))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
+ p, p->scx.weight);
}
static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
@@ -3880,20 +4038,22 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
static void switching_to_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_enable_task(p);
+ struct scx_sched *sch = scx_root;
+
+ scx_enable_task(p);
/*
* set_cpus_allowed_scx() is not called while @p is associated with a
* different scheduler class. Keep the BPF scheduler up-to-date.
*/
- if (SCX_HAS_OP(set_cpumask))
- SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, rq,
+ if (SCX_HAS_OP(sch, set_cpumask))
+ SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
p, (struct cpumask *)p->cpus_ptr);
}
static void switched_from_scx(struct rq *rq, struct task_struct *p)
{
- scx_ops_disable_task(p);
+ scx_disable_task(p);
}
static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
@@ -3936,8 +4096,14 @@ bool scx_can_stop_tick(struct rq *rq)
DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
static bool scx_cgroup_enabled;
+void scx_tg_init(struct task_group *tg)
+{
+ tg->scx_weight = CGROUP_WEIGHT_DFL;
+}
+
int scx_tg_online(struct task_group *tg)
{
+ struct scx_sched *sch = scx_root;
int ret = 0;
WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
@@ -3945,14 +4111,14 @@ int scx_tg_online(struct task_group *tg)
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled) {
- if (SCX_HAS_OP(cgroup_init)) {
+ if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
{ .weight = tg->scx_weight };
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
- tg->css.cgroup, &args);
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
+ NULL, tg->css.cgroup, &args);
if (ret)
- ret = ops_sanitize_err("cgroup_init", ret);
+ ret = ops_sanitize_err(sch, "cgroup_init", ret);
}
if (ret == 0)
tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
@@ -3966,12 +4132,16 @@ int scx_tg_online(struct task_group *tg)
void scx_tg_offline(struct task_group *tg)
{
+ struct scx_sched *sch = scx_root;
+
WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
percpu_down_read(&scx_cgroup_rwsem);
- if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup);
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
+ (tg->scx_flags & SCX_TG_INITED))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ tg->css.cgroup);
tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
@@ -3979,6 +4149,7 @@ void scx_tg_offline(struct task_group *tg)
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
{
+ struct scx_sched *sch = scx_root;
struct cgroup_subsys_state *css;
struct task_struct *p;
int ret;
@@ -4003,8 +4174,9 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
if (from == to)
continue;
- if (SCX_HAS_OP(cgroup_prep_move)) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, NULL,
+ if (SCX_HAS_OP(sch, cgroup_prep_move)) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
+ cgroup_prep_move, NULL,
p, from, css->cgroup);
if (ret)
goto err;
@@ -4017,18 +4189,21 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
err:
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
percpu_up_read(&scx_cgroup_rwsem);
- return ops_sanitize_err("cgroup_prep_move", ret);
+ return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}
void scx_cgroup_move_task(struct task_struct *p)
{
+ struct scx_sched *sch = scx_root;
+
if (!scx_cgroup_enabled)
return;
@@ -4036,9 +4211,11 @@ void scx_cgroup_move_task(struct task_struct *p)
* @p must have ops.cgroup_prep_move() called on it and thus
* cgrp_moving_from set.
*/
- if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
- SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, NULL,
- p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+ if (SCX_HAS_OP(sch, cgroup_move) &&
+ !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
+ SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
+ p, p->scx.cgrp_moving_from,
+ tg_cgrp(task_group(p)));
p->scx.cgrp_moving_from = NULL;
}
@@ -4049,6 +4226,7 @@ void scx_cgroup_finish_attach(void)
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
+ struct scx_sched *sch = scx_root;
struct cgroup_subsys_state *css;
struct task_struct *p;
@@ -4056,8 +4234,9 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
goto out_unlock;
cgroup_taskset_for_each(p, css, tset) {
- if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+ if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
+ p->scx.cgrp_moving_from)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
@@ -4067,14 +4246,16 @@ out_unlock:
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
+ struct scx_sched *sch = scx_root;
+
percpu_down_read(&scx_cgroup_rwsem);
- if (scx_cgroup_enabled && tg->scx_weight != weight) {
- if (SCX_HAS_OP(cgroup_set_weight))
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
- tg_cgrp(tg), weight);
- tg->scx_weight = weight;
- }
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
+ tg->scx_weight != weight)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
+ tg_cgrp(tg), weight);
+
+ tg->scx_weight = weight;
percpu_up_read(&scx_cgroup_rwsem);
}
@@ -4160,29 +4341,6 @@ static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
dsq->id = dsq_id;
}
-static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
-{
- struct scx_dispatch_q *dsq;
- int ret;
-
- if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
- return ERR_PTR(-EINVAL);
-
- dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq)
- return ERR_PTR(-ENOMEM);
-
- init_dsq(dsq, dsq_id);
-
- ret = rhashtable_lookup_insert_fast(&dsq_hash, &dsq->hash_node,
- dsq_hash_params);
- if (ret) {
- kfree(dsq);
- return ERR_PTR(ret);
- }
- return dsq;
-}
-
static void free_dsq_irq_workfn(struct irq_work *irq_work)
{
struct llist_node *to_free = llist_del_all(&dsqs_to_free);
@@ -4194,26 +4352,27 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work)
static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
-static void destroy_dsq(u64 dsq_id)
+static void destroy_dsq(struct scx_sched *sch, u64 dsq_id)
{
struct scx_dispatch_q *dsq;
unsigned long flags;
rcu_read_lock();
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (!dsq)
goto out_unlock_rcu;
raw_spin_lock_irqsave(&dsq->lock, flags);
if (dsq->nr) {
- scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
- dsq->id, dsq->nr);
+ scx_error(sch, "attempting to destroy in-use dsq 0x%016llx (nr=%u)",
+ dsq->id, dsq->nr);
goto out_unlock_dsq;
}
- if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
+ if (rhashtable_remove_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params))
goto out_unlock_dsq;
/*
@@ -4233,7 +4392,7 @@ out_unlock_rcu:
}
#ifdef CONFIG_EXT_GROUP_SCHED
-static void scx_cgroup_exit(void)
+static void scx_cgroup_exit(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
@@ -4253,14 +4412,15 @@ static void scx_cgroup_exit(void)
continue;
tg->scx_flags &= ~SCX_TG_INITED;
- if (!scx_ops.cgroup_exit)
+ if (!sch->ops.cgroup_exit)
continue;
if (WARN_ON_ONCE(!css_tryget(css)))
continue;
rcu_read_unlock();
- SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup);
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
+ css->cgroup);
rcu_read_lock();
css_put(css);
@@ -4268,7 +4428,7 @@ static void scx_cgroup_exit(void)
rcu_read_unlock();
}
-static int scx_cgroup_init(void)
+static int scx_cgroup_init(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
int ret;
@@ -4288,7 +4448,7 @@ static int scx_cgroup_init(void)
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
continue;
- if (!scx_ops.cgroup_init) {
+ if (!sch->ops.cgroup_init) {
tg->scx_flags |= SCX_TG_INITED;
continue;
}
@@ -4297,11 +4457,11 @@ static int scx_cgroup_init(void)
continue;
rcu_read_unlock();
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, NULL,
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
css_put(css);
- scx_ops_error("ops.cgroup_init() failed (%d)", ret);
+ scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
tg->scx_flags |= SCX_TG_INITED;
@@ -4318,8 +4478,8 @@ static int scx_cgroup_init(void)
}
#else
-static void scx_cgroup_exit(void) {}
-static int scx_cgroup_init(void) { return 0; }
+static void scx_cgroup_exit(struct scx_sched *sch) {}
+static int scx_cgroup_init(struct scx_sched *sch) { return 0; }
#endif
@@ -4336,8 +4496,7 @@ static int scx_cgroup_init(void) { return 0; }
static ssize_t scx_attr_state_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n",
- scx_ops_enable_state_str[scx_ops_enable_state()]);
+ return sysfs_emit(buf, "%s\n", scx_enable_state_str[scx_enable_state()]);
}
SCX_ATTR(state);
@@ -4382,15 +4541,51 @@ static const struct attribute_group scx_global_attr_group = {
.attrs = scx_global_attrs,
};
+static void free_exit_info(struct scx_exit_info *ei);
+
+static void scx_sched_free_rcu_work(struct work_struct *work)
+{
+ struct rcu_work *rcu_work = to_rcu_work(work);
+ struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
+ struct rhashtable_iter rht_iter;
+ struct scx_dispatch_q *dsq;
+ int node;
+
+ kthread_stop(sch->helper->task);
+ free_percpu(sch->event_stats_cpu);
+
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+
+ rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
+ do {
+ rhashtable_walk_start(&rht_iter);
+
+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+ destroy_dsq(sch, dsq->id);
+
+ rhashtable_walk_stop(&rht_iter);
+ } while (dsq == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&rht_iter);
+
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+ free_exit_info(sch->exit_info);
+ kfree(sch);
+}
+
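
The deferred free walks the per-scheduler DSQ rhashtable and restarts the walk while it keeps returning ERR_PTR(-EAGAIN). A toy, compilable model of that retry loop, not part of the patch; the fake walker injects one mid-walk restart and may revisit already-handled entries, which is as harmless here as in the kernel loop:

	#include <errno.h>
	#include <stdio.h>

	/* toy stand-in for rhashtable_walk_next(): yields items 0..4 but asks
	 * the caller to restart once, the way a concurrent resize would */
	static int walk_next(int *cursor, int *item)
	{
		static int restarted;

		if (!restarted && *cursor == 2) {
			restarted = 1;
			*cursor = 0;
			return -EAGAIN;
		}
		if (*cursor >= 5)
			return -ENOENT;		/* end of table */
		*item = (*cursor)++;
		return 0;
	}

	int main(void)
	{
		int cursor = 0, item, err;

		do {
			while (!(err = walk_next(&cursor, &item)))
				printf("destroy dsq %d\n", item);
		} while (err == -EAGAIN);	/* same retry condition as above */
		return 0;
	}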
static void scx_kobj_release(struct kobject *kobj)
{
- kfree(kobj);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
+ queue_rcu_work(system_unbound_wq, &sch->rcu_work);
}
static ssize_t scx_attr_ops_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n", scx_ops.name);
+ return sysfs_emit(buf, "%s\n", scx_root->ops.name);
}
SCX_ATTR(ops);
@@ -4401,16 +4596,17 @@ SCX_ATTR(ops);
static ssize_t scx_attr_events_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
struct scx_event_stats events;
int at = 0;
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(sch, &events);
at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
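
scx_read_events(), which replaces the scx_bpf_events() call here, sums the per-CPU counters into one snapshot before formatting them. An illustrative userspace sketch of that aggregation; the array, CPU count and field names are assumptions, not the kernel layout:

	#include <stdio.h>

	#define NR_CPUS 4

	struct event_stats { unsigned long long refill_slice_dfl, bypass_dispatch; };

	static struct event_stats percpu[NR_CPUS];	/* stands in for event_stats_cpu */

	static void read_events(struct event_stats *out)
	{
		int cpu;

		*out = (struct event_stats){ 0 };
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			out->refill_slice_dfl += percpu[cpu].refill_slice_dfl;
			out->bypass_dispatch += percpu[cpu].bypass_dispatch;
		}
	}

	int main(void)
	{
		struct event_stats snap;

		percpu[0].refill_slice_dfl = 3;
		percpu[2].refill_slice_dfl = 5;
		read_events(&snap);
		printf("refill_slice_dfl=%llu\n", snap.refill_slice_dfl);	/* 8 */
		return 0;
	}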
@@ -4433,7 +4629,7 @@ static const struct kobj_type scx_ktype = {
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
- return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
+ return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
}
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -4446,14 +4642,20 @@ static const struct kset_uevent_ops scx_uevent_ops = {
*/
bool task_should_scx(int policy)
{
- if (!scx_enabled() ||
- unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
+ if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING))
return false;
if (READ_ONCE(scx_switching_all))
return true;
return policy == SCHED_EXT;
}
+bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+ return !scx_enabled() ||
+ (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
+ p->sched_class != &ext_sched_class;
+}
+
/**
* scx_softlockup - sched_ext softlockup handler
* @dur_s: number of seconds of CPU stuck due to soft lockup
@@ -4466,40 +4668,48 @@ bool task_should_scx(int policy)
*/
void scx_softlockup(u32 dur_s)
{
- switch (scx_ops_enable_state()) {
- case SCX_OPS_ENABLING:
- case SCX_OPS_ENABLED:
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out_unlock;
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
break;
default:
- return;
+ goto out_unlock;
}
- /* allow only one instance, cleared at the end of scx_ops_bypass() */
+ /* allow only one instance, cleared at the end of scx_bypass() */
if (test_and_set_bit(0, &scx_in_softlockup))
- return;
+ goto out_unlock;
printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
- smp_processor_id(), dur_s, scx_ops.name);
+ smp_processor_id(), dur_s, scx_root->ops.name);
/*
* Some CPUs may be trapped in the dispatch paths. Enable breather
-	 * immediately; otherwise, we might not even be able to get to
-	 * scx_ops_bypass().
+	 * immediately; otherwise, we might not even be able to get to scx_bypass().
*/
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
- scx_ops_error("soft lockup - CPU#%d stuck for %us",
- smp_processor_id(), dur_s);
+ scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
+out_unlock:
+ rcu_read_unlock();
}
static void scx_clear_softlockup(void)
{
if (test_and_clear_bit(0, &scx_in_softlockup))
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
}
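
The softlockup path arms the breather only once per bypass cycle: the first caller wins the bit, later calls bail out, and scx_clear_softlockup() undoes both steps. A compilable sketch of that single-instance guard, with C11 atomics standing in for test_and_set_bit()/test_and_clear_bit():

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int in_softlockup;	/* stands in for scx_in_softlockup */
	static atomic_int breather_depth;	/* stands in for scx_breather_depth */

	static void softlockup(unsigned int dur_s)
	{
		if (atomic_exchange(&in_softlockup, 1))
			return;			/* already reported this cycle */
		atomic_fetch_add(&breather_depth, 1);
		printf("soft lockup - stuck for %us\n", dur_s);
	}

	static void clear_softlockup(void)
	{
		if (atomic_exchange(&in_softlockup, 0))
			atomic_fetch_sub(&breather_depth, 1);
	}

	int main(void)
	{
		softlockup(10);
		softlockup(20);			/* ignored */
		clear_softlockup();
		printf("breather_depth=%d\n", atomic_load(&breather_depth));
		return 0;
	}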
/**
- * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
* @bypass: true for bypass, false for unbypass
*
* Bypassing guarantees that all runnable tasks make forward progress without
@@ -4529,32 +4739,36 @@ static void scx_clear_softlockup(void)
*
* - scx_prio_less() reverts to the default core_sched_at order.
*/
-static void scx_ops_bypass(bool bypass)
+static void scx_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
static unsigned long bypass_timestamp;
-
- int cpu;
+ struct scx_sched *sch;
unsigned long flags;
+ int cpu;
raw_spin_lock_irqsave(&bypass_lock, flags);
+ sch = rcu_dereference_bh(scx_root);
+
if (bypass) {
- scx_ops_bypass_depth++;
- WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
- if (scx_ops_bypass_depth != 1)
+ scx_bypass_depth++;
+ WARN_ON_ONCE(scx_bypass_depth <= 0);
+ if (scx_bypass_depth != 1)
goto unlock;
bypass_timestamp = ktime_get_ns();
- scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
} else {
- scx_ops_bypass_depth--;
- WARN_ON_ONCE(scx_ops_bypass_depth < 0);
- if (scx_ops_bypass_depth != 0)
+ scx_bypass_depth--;
+ WARN_ON_ONCE(scx_bypass_depth < 0);
+ if (scx_bypass_depth != 0)
goto unlock;
- scx_add_event(SCX_EV_BYPASS_DURATION,
- ktime_get_ns() - bypass_timestamp);
+ if (sch)
+ scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
}
- atomic_inc(&scx_ops_breather_depth);
+ atomic_inc(&scx_breather_depth);
/*
* No task property is changing. We just need to make sure all currently
@@ -4612,7 +4826,7 @@ static void scx_ops_bypass(bool bypass)
raw_spin_rq_unlock(rq);
}
- atomic_dec(&scx_ops_breather_depth);
+ atomic_dec(&scx_breather_depth);
unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags);
scx_clear_softlockup();
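
scx_bypass() nests through a depth counter, so only the 0->1 and 1->0 transitions do the expensive part (timestamping, events, walking every runqueue). A minimal sketch of the depth-counted toggle, with printf standing in for that work:

	#include <stdio.h>

	static int bypass_depth;

	static void bypass(int on)
	{
		if (on) {
			if (++bypass_depth != 1)
				return;			/* already bypassing, just nest */
			printf("enter bypass\n");	/* timestamp + activate event */
		} else {
			if (--bypass_depth != 0)
				return;			/* still nested */
			printf("leave bypass\n");	/* duration event */
		}
	}

	int main(void)
	{
		bypass(1);	/* does the work */
		bypass(1);	/* nested no-op */
		bypass(0);	/* nested no-op */
		bypass(0);	/* does the work */
		return 0;
	}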
@@ -4668,42 +4882,36 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
}
}
-static void scx_ops_disable_workfn(struct kthread_work *work)
+static void scx_disable_workfn(struct kthread_work *work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
+ struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti;
struct task_struct *p;
- struct rhashtable_iter rht_iter;
- struct scx_dispatch_q *dsq;
- int i, kind, cpu;
+ int kind, cpu;
- kind = atomic_read(&scx_exit_kind);
+ kind = atomic_read(&sch->exit_kind);
while (true) {
- /*
- * NONE indicates that a new scx_ops has been registered since
- * disable was scheduled - don't kill the new ops. DONE
- * indicates that the ops has already been disabled.
- */
- if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
+ if (kind == SCX_EXIT_DONE) /* already disabled? */
return;
- if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
+ WARN_ON_ONCE(kind == SCX_EXIT_NONE);
+ if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
break;
}
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
/* guarantee forward progress by bypassing scx_ops */
- scx_ops_bypass(true);
+ scx_bypass(true);
- switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
- case SCX_OPS_DISABLING:
+ switch (scx_set_enable_state(SCX_DISABLING)) {
+ case SCX_DISABLING:
WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
break;
- case SCX_OPS_DISABLED:
+ case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n",
- scx_exit_info->msg);
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ sch->exit_info->msg);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
goto done;
default:
break;
@@ -4714,17 +4922,17 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
* we can safely use blocking synchronization constructs. Actually
* disable ops.
*/
- mutex_lock(&scx_ops_enable_mutex);
+ mutex_lock(&scx_enable_mutex);
static_branch_disable(&__scx_switched_all);
WRITE_ONCE(scx_switching_all, false);
/*
* Shut down cgroup support before tasks so that the cgroup attach path
- * doesn't race against scx_ops_exit_task().
+ * doesn't race against scx_exit_task().
*/
scx_cgroup_lock();
- scx_cgroup_exit();
+ scx_cgroup_exit(sch);
scx_cgroup_unlock();
/*
@@ -4733,7 +4941,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
*/
percpu_down_write(&scx_fork_rwsem);
- scx_ops_init_task_enabled = false;
+ scx_init_task_enabled = false;
scx_task_iter_start(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
@@ -4753,7 +4961,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
- scx_ops_exit_task(p);
+ scx_exit_task(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
@@ -4768,98 +4976,71 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
}
/* no task is on scx, turn off all the switches and flush in-progress calls */
- static_branch_disable(&__scx_ops_enabled);
- for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
- static_branch_disable(&scx_has_op[i]);
- static_branch_disable(&scx_ops_allow_queued_wakeup);
- static_branch_disable(&scx_ops_enq_last);
- static_branch_disable(&scx_ops_enq_exiting);
- static_branch_disable(&scx_ops_enq_migration_disabled);
- static_branch_disable(&scx_ops_cpu_preempt);
+ static_branch_disable(&__scx_enabled);
+ bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ sch->ops.name, ei->reason);
if (ei->msg[0] != '\0')
- pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
+ pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE
stack_trace_print(ei->bt, ei->bt_len, 2);
#endif
} else {
pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
- scx_ops.name, ei->reason);
+ sch->ops.name, ei->reason);
}
- if (scx_ops.exit)
- SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
+ if (sch->ops.exit)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
/*
- * Delete the kobject from the hierarchy eagerly in addition to just
- * dropping a reference. Otherwise, if the object is deleted
- * asynchronously, sysfs could observe an object of the same name still
- * in the hierarchy when another scheduler is loaded.
+ * scx_root clearing must be inside cpus_read_lock(). See
+ * handle_hotplug().
*/
- kobject_del(scx_root_kobj);
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
-
- memset(&scx_ops, 0, sizeof(scx_ops));
-
- rhashtable_walk_enter(&dsq_hash, &rht_iter);
- do {
- rhashtable_walk_start(&rht_iter);
-
- while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
- destroy_dsq(dsq->id);
+ cpus_read_lock();
+ RCU_INIT_POINTER(scx_root, NULL);
+ cpus_read_unlock();
- rhashtable_walk_stop(&rht_iter);
- } while (dsq == ERR_PTR(-EAGAIN));
- rhashtable_walk_exit(&rht_iter);
+ /*
+ * Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
+ * could observe an object of the same name still in the hierarchy when
+ * the next scheduler is loaded.
+ */
+ kobject_del(&sch->kobj);
free_percpu(scx_dsp_ctx);
scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0;
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
-
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
- SCX_OPS_DISABLING);
+ WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
done:
- scx_ops_bypass(false);
+ scx_bypass(false);
}
-static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
-
-static void schedule_scx_ops_disable_work(void)
-{
- struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
-
- /*
- * We may be called spuriously before the first bpf_sched_ext_reg(). If
- * scx_ops_helper isn't set up yet, there's nothing to do.
- */
- if (helper)
- kthread_queue_work(helper, &scx_ops_disable_work);
-}
-
-static void scx_ops_disable(enum scx_exit_kind kind)
+static void scx_disable(enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
kind = SCX_EXIT_ERROR;
- atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
-
- schedule_scx_ops_disable_work();
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch) {
+ atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
+ kthread_queue_work(sch->helper, &sch->disable_work);
+ }
+ rcu_read_unlock();
}
static void dump_newline(struct seq_buf *s)
@@ -4977,6 +5158,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
struct task_struct *p, char marker)
{
static unsigned long bt[SCX_EXIT_BT_LEN];
+ struct scx_sched *sch = scx_root;
char dsq_id_buf[19] = "(n/a)";
unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
unsigned int bt_len = 0;
@@ -4999,9 +5181,9 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
- if (SCX_HAS_OP(dump_task)) {
+ if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_task, NULL, dctx, p);
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p);
ops_dump_exit();
}
@@ -5018,6 +5200,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
{
static DEFINE_SPINLOCK(dump_lock);
static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
+ struct scx_sched *sch = scx_root;
struct scx_dump_ctx dctx = {
.kind = ei->kind,
.exit_code = ei->exit_code,
@@ -5046,9 +5229,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_stack_trace(&s, " ", ei->bt, ei->bt_len);
}
- if (SCX_HAS_OP(dump)) {
+ if (SCX_HAS_OP(sch, dump)) {
ops_dump_init(&s, "");
- SCX_CALL_OP(SCX_KF_UNLOCKED, dump, NULL, &dctx);
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx);
ops_dump_exit();
}
@@ -5069,7 +5252,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
idle = list_empty(&rq->scx.runnable_list) &&
rq->curr->sched_class == &idle_sched_class;
- if (idle && !SCX_HAS_OP(dump_cpu))
+ if (idle && !SCX_HAS_OP(sch, dump_cpu))
goto next;
/*
@@ -5103,9 +5286,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
cpumask_pr_args(rq->scx.cpus_to_wait));
used = seq_buf_used(&ns);
- if (SCX_HAS_OP(dump_cpu)) {
+ if (SCX_HAS_OP(sch, dump_cpu)) {
ops_dump_init(&ns, " ");
- SCX_CALL_OP(SCX_KF_REST, dump_cpu, NULL, &dctx, cpu, idle);
+ SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL,
+ &dctx, cpu, idle);
ops_dump_exit();
}
@@ -5139,13 +5323,13 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
dump_line(&s, "Event counters");
dump_line(&s, "--------------");
- scx_bpf_events(&events, sizeof(events));
+ scx_read_events(sch, &events);
scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
+ scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
@@ -5157,27 +5341,25 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
spin_unlock_irqrestore(&dump_lock, flags);
}
-static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
+static void scx_error_irq_workfn(struct irq_work *irq_work)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
+ struct scx_exit_info *ei = sch->exit_info;
if (ei->kind >= SCX_EXIT_ERROR)
- scx_dump_state(ei, scx_ops.exit_dump_len);
+ scx_dump_state(ei, sch->ops.exit_dump_len);
- schedule_scx_ops_disable_work();
+ kthread_queue_work(sch->helper, &sch->disable_work);
}
-static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
-
-static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
- s64 exit_code,
- const char *fmt, ...)
+static void scx_vexit(struct scx_sched *sch,
+ enum scx_exit_kind kind, s64 exit_code,
+ const char *fmt, va_list args)
{
- struct scx_exit_info *ei = scx_exit_info;
+ struct scx_exit_info *ei = sch->exit_info;
int none = SCX_EXIT_NONE;
- va_list args;
- if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
+ if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
return;
ei->exit_code = exit_code;
@@ -5185,31 +5367,98 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
if (kind >= SCX_EXIT_ERROR)
ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
#endif
- va_start(args, fmt);
vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
- va_end(args);
/*
* Set ei->kind and ->reason for scx_dump_state(). They'll be set again
- * in scx_ops_disable_workfn().
+ * in scx_disable_workfn().
*/
ei->kind = kind;
ei->reason = scx_exit_reason(ei->kind);
- irq_work_queue(&scx_ops_error_irq_work);
+ irq_work_queue(&sch->error_irq_work);
}
-static struct kthread_worker *scx_create_rt_helper(const char *name)
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
{
- struct kthread_worker *helper;
+ struct scx_sched *sch;
+ int node, ret;
- helper = kthread_run_worker(0, name);
- if (helper)
- sched_set_fifo(helper->task);
- return helper;
-}
+ sch = kzalloc(sizeof(*sch), GFP_KERNEL);
+ if (!sch)
+ return ERR_PTR(-ENOMEM);
+
+ sch->exit_info = alloc_exit_info(ops->exit_dump_len);
+ if (!sch->exit_info) {
+ ret = -ENOMEM;
+ goto err_free_sch;
+ }
+
+ ret = rhashtable_init(&sch->dsq_hash, &dsq_hash_params);
+ if (ret < 0)
+ goto err_free_ei;
+
+ sch->global_dsqs = kcalloc(nr_node_ids, sizeof(sch->global_dsqs[0]),
+ GFP_KERNEL);
+ if (!sch->global_dsqs) {
+ ret = -ENOMEM;
+ goto err_free_hash;
+ }
-static void check_hotplug_seq(const struct sched_ext_ops *ops)
+ for_each_node_state(node, N_POSSIBLE) {
+ struct scx_dispatch_q *dsq;
+
+ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq) {
+ ret = -ENOMEM;
+ goto err_free_gdsqs;
+ }
+
+ init_dsq(dsq, SCX_DSQ_GLOBAL);
+ sch->global_dsqs[node] = dsq;
+ }
+
+ sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
+ if (!sch->event_stats_cpu)
+ goto err_free_gdsqs;
+
+ sch->helper = kthread_run_worker(0, "sched_ext_helper");
+ if (!sch->helper)
+ goto err_free_event_stats;
+ sched_set_fifo(sch->helper->task);
+
+ atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
+ init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
+ kthread_init_work(&sch->disable_work, scx_disable_workfn);
+ sch->ops = *ops;
+ ops->priv = sch;
+
+ sch->kobj.kset = scx_kset;
+ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+ if (ret < 0)
+ goto err_stop_helper;
+
+ return sch;
+
+err_stop_helper:
+ kthread_stop(sch->helper->task);
+err_free_event_stats:
+ free_percpu(sch->event_stats_cpu);
+err_free_gdsqs:
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(sch->global_dsqs[node]);
+ kfree(sch->global_dsqs);
+err_free_hash:
+ rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
+err_free_ei:
+ free_exit_info(sch->exit_info);
+err_free_sch:
+ kfree(sch);
+ return ERR_PTR(ret);
+}
+
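
scx_alloc_and_add_sched() acquires its resources in order and unwinds them with goto labels in the reverse order on failure. A small, compilable sketch of that unwind shape; the members and sizes are placeholders, not the real initialisation:

	#include <stdio.h>
	#include <stdlib.h>

	struct sched { void *exit_info, *dsq_hash, *global_dsqs; };

	static struct sched *alloc_sched(void)
	{
		struct sched *s = calloc(1, sizeof(*s));

		if (!s)
			return NULL;
		if (!(s->exit_info = malloc(64)))
			goto err_free_s;
		if (!(s->dsq_hash = malloc(64)))
			goto err_free_ei;
		if (!(s->global_dsqs = malloc(64)))
			goto err_free_hash;
		return s;

	err_free_hash:			/* reverse order of the setup above */
		free(s->dsq_hash);
	err_free_ei:
		free(s->exit_info);
	err_free_s:
		free(s);
		return NULL;
	}

	int main(void)
	{
		struct sched *s = alloc_sched();

		printf("%s\n", s ? "allocated" : "failed");
		return 0;
	}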
+static void check_hotplug_seq(struct scx_sched *sch,
+ const struct sched_ext_ops *ops)
{
unsigned long long global_hotplug_seq;
@@ -5221,21 +5470,22 @@ static void check_hotplug_seq(const struct sched_ext_ops *ops)
if (ops->hotplug_seq) {
global_hotplug_seq = atomic_long_read(&scx_hotplug_seq);
if (ops->hotplug_seq != global_hotplug_seq) {
- scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
- "expected hotplug seq %llu did not match actual %llu",
- ops->hotplug_seq, global_hotplug_seq);
+ scx_exit(sch, SCX_EXIT_UNREG_KERN,
+ SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
+ "expected hotplug seq %llu did not match actual %llu",
+ ops->hotplug_seq, global_hotplug_seq);
}
}
}
-static int validate_ops(const struct sched_ext_ops *ops)
+static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
{
/*
* It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
* ops.enqueue() callback isn't implemented.
*/
if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
- scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
+ scx_error(sch, "SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
return -EINVAL;
}
@@ -5245,7 +5495,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
*/
if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
(ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
return -EINVAL;
}
@@ -5255,12 +5505,13 @@ static int validate_ops(const struct sched_ext_ops *ops)
return 0;
}
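
validate_ops() rejects flag combinations whose prerequisite callbacks are missing, e.g. SCX_OPS_ENQ_LAST without ops.enqueue(). A self-contained sketch of that dependency check; the flag value and struct layout are invented for illustration:

	#include <errno.h>
	#include <stdio.h>

	#define OPS_ENQ_LAST	(1u << 0)	/* hypothetical flag bit */

	struct ops {
		unsigned int flags;
		void (*enqueue)(void);
	};

	static int validate_ops(const struct ops *ops)
	{
		/* a flag that alters enqueue behaviour needs enqueue() itself */
		if ((ops->flags & OPS_ENQ_LAST) && !ops->enqueue)
			return -EINVAL;
		return 0;
	}

	int main(void)
	{
		struct ops bad = { .flags = OPS_ENQ_LAST, .enqueue = NULL };

		printf("validate: %d\n", validate_ops(&bad));	/* -EINVAL, -22 on Linux */
		return 0;
	}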
-static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{
+ struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
- int i, cpu, node, ret;
+ int i, cpu, ret;
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
@@ -5268,87 +5519,25 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
return -EINVAL;
}
- mutex_lock(&scx_ops_enable_mutex);
-
- /*
- * Clear event counters so a new scx scheduler gets
- * fresh event counter values.
- */
- for_each_possible_cpu(cpu) {
- struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
- memset(e, 0, sizeof(*e));
- }
-
- if (!scx_ops_helper) {
- WRITE_ONCE(scx_ops_helper,
- scx_create_rt_helper("sched_ext_ops_helper"));
- if (!scx_ops_helper) {
- ret = -ENOMEM;
- goto err_unlock;
- }
- }
-
- if (!global_dsqs) {
- struct scx_dispatch_q **dsqs;
-
- dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
- if (!dsqs) {
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- for_each_node_state(node, N_POSSIBLE) {
- struct scx_dispatch_q *dsq;
-
- dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
- if (!dsq) {
- for_each_node_state(node, N_POSSIBLE)
- kfree(dsqs[node]);
- kfree(dsqs);
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- init_dsq(dsq, SCX_DSQ_GLOBAL);
- dsqs[node] = dsq;
- }
-
- global_dsqs = dsqs;
- }
+ mutex_lock(&scx_enable_mutex);
- if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
+ if (scx_enable_state() != SCX_DISABLED) {
ret = -EBUSY;
goto err_unlock;
}
- scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
- if (!scx_root_kobj) {
- ret = -ENOMEM;
+ sch = scx_alloc_and_add_sched(ops);
+ if (IS_ERR(sch)) {
+ ret = PTR_ERR(sch);
goto err_unlock;
}
- scx_root_kobj->kset = scx_kset;
- ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
- if (ret < 0)
- goto err;
-
- scx_exit_info = alloc_exit_info(ops->exit_dump_len);
- if (!scx_exit_info) {
- ret = -ENOMEM;
- goto err_del;
- }
-
/*
- * Set scx_ops, transition to ENABLING and clear exit info to arm the
- * disable path. Failure triggers full disabling from here on.
+ * Transition to ENABLING and clear exit info to arm the disable path.
+ * Failure triggers full disabling from here on.
*/
- scx_ops = *ops;
-
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
- SCX_OPS_DISABLED);
-
- atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
- scx_warned_zero_slice = false;
+ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
+ WARN_ON_ONCE(scx_root);
atomic_long_set(&scx_nr_rejected, 0);
@@ -5361,28 +5550,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
cpus_read_lock();
+ /*
+ * Make the scheduler instance visible. Must be inside cpus_read_lock().
+ * See handle_hotplug().
+ */
+ rcu_assign_pointer(scx_root, sch);
+
scx_idle_enable(ops);
- if (scx_ops.init) {
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
+ if (sch->ops.init) {
+ ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
if (ret) {
- ret = ops_sanitize_err("init", ret);
+ ret = ops_sanitize_err(sch, "init", ret);
cpus_read_unlock();
- scx_ops_error("ops.init() failed (%d)", ret);
+ scx_error(sch, "ops.init() failed (%d)", ret);
goto err_disable;
}
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i])
- static_branch_enable_cpuslocked(&scx_has_op[i]);
+ set_bit(i, sch->has_op);
- check_hotplug_seq(ops);
+ check_hotplug_seq(sch, ops);
scx_idle_update_selcpu_topology(ops);
cpus_read_unlock();
- ret = validate_ops(ops);
+ ret = validate_ops(sch, ops);
if (ret)
goto err_disable;
@@ -5407,27 +5602,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_watchdog_timeout / 2);
/*
- * Once __scx_ops_enabled is set, %current can be switched to SCX
- * anytime. This can lead to stalls as some BPF schedulers (e.g.
- * userspace scheduling) may not function correctly before all tasks are
- * switched. Init in bypass mode to guarantee forward progress.
+ * Once __scx_enabled is set, %current can be switched to SCX anytime.
+ * This can lead to stalls as some BPF schedulers (e.g. userspace
+ * scheduling) may not function correctly before all tasks are switched.
+ * Init in bypass mode to guarantee forward progress.
*/
- scx_ops_bypass(true);
+ scx_bypass(true);
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
- static_branch_enable(&scx_has_op[i]);
-
- if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
- static_branch_enable(&scx_ops_allow_queued_wakeup);
- if (ops->flags & SCX_OPS_ENQ_LAST)
- static_branch_enable(&scx_ops_enq_last);
- if (ops->flags & SCX_OPS_ENQ_EXITING)
- static_branch_enable(&scx_ops_enq_exiting);
- if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
- static_branch_enable(&scx_ops_enq_migration_disabled);
- if (scx_ops.cpu_acquire || scx_ops.cpu_release)
- static_branch_enable(&scx_ops_cpu_preempt);
+ set_bit(i, sch->has_op);
+
+ if (sch->ops.cpu_acquire || sch->ops.cpu_release)
+ sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -5435,8 +5622,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/
percpu_down_write(&scx_fork_rwsem);
- WARN_ON_ONCE(scx_ops_init_task_enabled);
- scx_ops_init_task_enabled = true;
+ WARN_ON_ONCE(scx_init_task_enabled);
+ scx_init_task_enabled = true;
/*
* Enable ops for every task. Fork is excluded by scx_fork_rwsem
@@ -5445,14 +5632,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* tasks. Prep all tasks first and then enable them with preemption
* disabled.
*
- * All cgroups should be initialized before scx_ops_init_task() so that
- * the BPF scheduler can reliably track each task's cgroup membership
- * from scx_ops_init_task(). Lock out cgroup on/offlining and task
- * migrations while tasks are being initialized so that
- * scx_cgroup_can_attach() never sees uninitialized tasks.
+ * All cgroups should be initialized before scx_init_task() so that the
+ * BPF scheduler can reliably track each task's cgroup membership from
+ * scx_init_task(). Lock out cgroup on/offlining and task migrations
+ * while tasks are being initialized so that scx_cgroup_can_attach()
+ * never sees uninitialized tasks.
*/
scx_cgroup_lock();
- ret = scx_cgroup_init();
+ ret = scx_cgroup_init(sch);
if (ret)
goto err_disable_unlock_all;
@@ -5468,13 +5655,13 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_unlock(&sti);
- ret = scx_ops_init_task(p, task_group(p), false);
+ ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
- scx_ops_error("ops.init_task() failed (%d) for %s[%d]",
- ret, p->comm, p->pid);
+ scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
+ ret, p->comm, p->pid);
goto err_disable_unlock_all;
}
@@ -5492,7 +5679,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* all eligible tasks.
*/
WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
- static_branch_enable(&__scx_ops_enabled);
+ static_branch_enable(&__scx_enabled);
/*
* We're fully committed and can't fail. The task READY -> ENABLED
@@ -5523,10 +5710,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
- if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
- WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
+ if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
+ WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
goto err_disable;
}
@@ -5534,44 +5721,35 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable(&__scx_switched_all);
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
- scx_ops.name, scx_switched_all() ? "" : " (partial)");
- kobject_uevent(scx_root_kobj, KOBJ_ADD);
- mutex_unlock(&scx_ops_enable_mutex);
+ sch->ops.name, scx_switched_all() ? "" : " (partial)");
+ kobject_uevent(&sch->kobj, KOBJ_ADD);
+ mutex_unlock(&scx_enable_mutex);
atomic_long_inc(&scx_enable_seq);
return 0;
-err_del:
- kobject_del(scx_root_kobj);
-err:
- kobject_put(scx_root_kobj);
- scx_root_kobj = NULL;
- if (scx_exit_info) {
- free_exit_info(scx_exit_info);
- scx_exit_info = NULL;
- }
err_unlock:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
return ret;
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_ops_bypass(false);
+ scx_bypass(false);
err_disable:
- mutex_unlock(&scx_ops_enable_mutex);
+ mutex_unlock(&scx_enable_mutex);
/*
* Returning an error code here would not pass all the error information
- * to userspace. Record errno using scx_ops_error() for cases
- * scx_ops_error() wasn't already invoked and exit indicating success so
- * that the error is notified through ops.exit() with all the details.
+ * to userspace. Record errno using scx_error() for cases scx_error()
+ * wasn't already invoked and exit indicating success so that the error
+ * is notified through ops.exit() with all the details.
*
- * Flush scx_ops_disable_work to ensure that error is reported before
- * init completion.
+ * Flush scx_disable_work to ensure that error is reported before init
+ * completion. sch's base reference will be put by bpf_scx_unreg().
*/
- scx_ops_error("scx_ops_enable() failed (%d)", ret);
- kthread_flush_work(&scx_ops_disable_work);
+ scx_error(sch, "scx_enable() failed (%d)", ret);
+ kthread_flush_work(&sch->disable_work);
return 0;
}
@@ -5622,21 +5800,8 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
return -EACCES;
}
-static const struct bpf_func_proto *
-bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- switch (func_id) {
- case BPF_FUNC_task_storage_get:
- return &bpf_task_storage_get_proto;
- case BPF_FUNC_task_storage_delete:
- return &bpf_task_storage_delete_proto;
- default:
- return bpf_base_func_proto(func_id, prog);
- }
-}
-
static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
- .get_func_proto = bpf_scx_get_func_proto,
+ .get_func_proto = bpf_base_func_proto,
.is_valid_access = bpf_scx_is_valid_access,
.btf_struct_access = bpf_scx_btf_struct_access,
};
@@ -5715,13 +5880,17 @@ static int bpf_scx_check_member(const struct btf_type *t,
static int bpf_scx_reg(void *kdata, struct bpf_link *link)
{
- return scx_ops_enable(kdata, link);
+ return scx_enable(kdata, link);
}
static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{
- scx_ops_disable(SCX_EXIT_UNREG);
- kthread_flush_work(&scx_ops_disable_work);
+ struct sched_ext_ops *ops = kdata;
+ struct scx_sched *sch = ops->priv;
+
+ scx_disable(SCX_EXIT_UNREG);
+ kthread_flush_work(&sch->disable_work);
+ kobject_put(&sch->kobj);
}
static int bpf_scx_init(struct btf *btf)
@@ -5843,10 +6012,7 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
static void sysrq_handle_sched_ext_reset(u8 key)
{
- if (scx_ops_helper)
- scx_ops_disable(SCX_EXIT_SYSRQ);
- else
- pr_info("sched_ext: BPF scheduler not yet used\n");
+ scx_disable(SCX_EXIT_SYSRQ);
}
static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
@@ -5994,13 +6160,14 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
*/
void print_scx_info(const char *log_lvl, struct task_struct *p)
{
- enum scx_ops_enable_state state = scx_ops_enable_state();
+ struct scx_sched *sch = scx_root;
+ enum scx_enable_state state = scx_enable_state();
const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
char runnable_at_buf[22] = "?";
struct sched_class *class;
unsigned long runnable_at;
- if (state == SCX_OPS_DISABLED)
+ if (state == SCX_DISABLED)
return;
/*
@@ -6009,8 +6176,8 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
*/
if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
class != &ext_sched_class) {
- printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
- scx_ops_enable_state_str[state], all);
+ printk("%sSched_ext: %s (%s%s)", log_lvl, sch->ops.name,
+ scx_enable_state_str[state], all);
return;
}
@@ -6021,7 +6188,7 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
/* print everything onto one line to conserve console space */
printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
- log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
+ log_lvl, sch->ops.name, scx_enable_state_str[state], all,
runnable_at_buf);
}
@@ -6037,12 +6204,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
case PM_RESTORE_PREPARE:
- scx_ops_bypass(true);
+ scx_bypass(true);
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
case PM_POST_RESTORE:
- scx_ops_bypass(false);
+ scx_bypass(false);
break;
}
@@ -6065,7 +6232,6 @@ void __init init_sched_ext_class(void)
WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT |
SCX_TG_ONLINE);
- BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
scx_idle_init_masks();
scx_kick_cpus_pnt_seqs =
@@ -6109,12 +6275,12 @@ static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_ops_error("called with NULL task");
+ scx_kf_error("called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_kf_error("invalid enq_flags 0x%llx", enq_flags);
return false;
}
@@ -6134,7 +6300,7 @@ static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_ops_error("dispatch buffer overflow");
+ scx_kf_error("dispatch buffer overflow");
return;
}
@@ -6266,6 +6432,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
struct task_struct *p, u64 dsq_id, u64 enq_flags)
{
+ struct scx_sched *sch = scx_root;
struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
struct rq *this_rq, *src_rq, *locked_rq;
bool dispatched = false;
@@ -6300,7 +6467,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
* cause similar live-lock conditions as consume_dispatch_q(). Insert a
* breather if necessary.
*/
- scx_ops_breather(src_rq);
+ scx_breather(src_rq);
locked_rq = src_rq;
raw_spin_lock(&src_dsq->lock);
@@ -6318,7 +6485,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
}
/* @p is still on $src_dsq and stable, determine the destination */
- dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p);
+ dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
/*
* Apply vtime and slice updates before moving so that the new time is
@@ -6331,7 +6498,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
p->scx.slice = kit->slice;
/* execute move */
- locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq);
+ locked_rq = move_task_between_dsqs(sch, p, enq_flags, src_dsq, dst_dsq);
dispatched = true;
out:
if (in_balance) {
@@ -6379,7 +6546,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_ops_error("dispatch buffer underflow");
+ scx_kf_error("dispatch buffer underflow");
}
/**
@@ -6398,21 +6565,22 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
+ struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
if (!scx_kf_allowed(SCX_KF_DISPATCH))
return false;
- flush_dispatch_buf(dspc->rq);
+ flush_dispatch_buf(sch, dspc->rq);
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
- scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
+ scx_error(sch, "invalid DSQ ID 0x%016llx", dsq_id);
return false;
}
- if (consume_dispatch_q(dspc->rq, dsq)) {
+ if (consume_dispatch_q(sch, dspc->rq, dsq)) {
/*
* A successfully consumed task can be dequeued before it starts
* running while the CPU is trying to migrate other dispatched
@@ -6666,10 +6834,36 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
{
+ struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
+ s32 ret;
+
if (unlikely(node >= (int)nr_node_ids ||
(node < 0 && node != NUMA_NO_NODE)))
return -EINVAL;
- return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
+
+ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN))
+ return -EINVAL;
+
+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq)
+ return -ENOMEM;
+
+ init_dsq(dsq, dsq_id);
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
+ dsq_hash_params);
+ else
+ ret = -ENODEV;
+
+ rcu_read_unlock();
+ if (ret)
+ kfree(dsq);
+ return ret;
}
__bpf_kfunc_end_defs();
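
As a usage sketch, not part of the patch: since the allocation above uses GFP_KERNEL, schedulers typically create their custom DSQs from the sleepable ops.init() callback. MY_DSQ_ID and BPF_STRUCT_OPS_SLEEPABLE below are assumptions from the scx BPF tooling.

/* Illustrative: create one custom DSQ at load time. Any id without
 * SCX_DSQ_FLAG_BUILTIN bits is acceptable; 0 is a made-up choice. */
#define MY_DSQ_ID	0

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
{
	return scx_bpf_create_dsq(MY_DSQ_ID, NUMA_NO_NODE);
}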
@@ -6708,7 +6902,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *this_rq;
unsigned long irq_flags;
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return;
local_irq_save(irq_flags);
@@ -6732,7 +6926,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6765,23 +6959,30 @@ out:
*/
__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
{
+ struct scx_sched *sch;
struct scx_dispatch_q *dsq;
s32 ret;
preempt_disable();
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
if (dsq_id == SCX_DSQ_LOCAL) {
ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
goto out;
} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
- if (ops_cpu_valid(cpu, NULL)) {
+ if (ops_cpu_valid(sch, cpu, NULL)) {
ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
goto out;
}
} else {
- dsq = find_user_dsq(dsq_id);
+ dsq = find_user_dsq(sch, dsq_id);
if (dsq) {
ret = READ_ONCE(dsq->nr);
goto out;
@@ -6804,7 +7005,13 @@ out:
*/
__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
{
- destroy_dsq(dsq_id);
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ destroy_dsq(sch, dsq_id);
+ rcu_read_unlock();
}
/**
@@ -6821,6 +7028,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
u64 flags)
{
struct bpf_iter_scx_dsq_kern *kit = (void *)it;
+ struct scx_sched *sch;
BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) >
sizeof(struct bpf_iter_scx_dsq));
@@ -6833,10 +7041,14 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
*/
kit->dsq = NULL;
+ sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held());
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
return -EINVAL;
- kit->dsq = find_user_dsq(dsq_id);
+ kit->dsq = find_user_dsq(sch, dsq_id);
if (!kit->dsq)
return -ENOENT;
@@ -6926,21 +7138,20 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_ops_error("invalid data=%p and data__sz=%u",
- (void *)data, data__sz);
+ scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_ops_error("failed to read data fields (%d)", ret);
+ scx_kf_error("failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_ops_error("format preparation failed (%d)", ret);
+ scx_kf_error("format preparation failed (%d)", ret);
return ret;
}
@@ -6948,8 +7159,7 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_ops_error("(\"%s\", %p, %u) failed to format",
- fmt, data, data__sz);
+ scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
@@ -6982,8 +7192,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
- scx_exit_bstr_buf.line);
+ scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7003,8 +7212,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
- scx_exit_bstr_buf.line);
+ scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
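
For context, BPF schedulers usually reach scx_bpf_error_bstr() through the scx_bpf_error() convenience macro provided by the scx tooling headers (an assumption about the userspace side, not shown in this patch). A hedged sketch of reporting a bad load-time knob through that path; nr_slices is a made-up rodata variable:

const volatile u32 nr_slices;	/* hypothetical knob set by the loader */

s32 BPF_STRUCT_OPS_SLEEPABLE(example_validate_init)
{
	if (!nr_slices)
		scx_bpf_error("nr_slices must be non-zero");

	/* scx_bpf_error() already initiates teardown via SCX_EXIT_ERROR_BPF,
	 * so the return value only matters on the non-error path. */
	return 0;
}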
@@ -7028,7 +7236,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
s32 ret;
if (raw_smp_processor_id() != dd->cpu) {
- scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
@@ -7069,7 +7277,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7091,7 +7299,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7114,11 +7322,11 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
- if (ops_cpu_valid(cpu, NULL)) {
+ if (kf_cpu_valid(cpu, NULL)) {
struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
struct rq_flags rf;
@@ -7127,7 +7335,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
* to the corresponding CPU to prevent ABBA deadlocks.
*/
if (locked_rq && rq != locked_rq) {
- scx_ops_error("Invalid target CPU %d", cpu);
+ scx_kf_error("Invalid target CPU %d", cpu);
return;
}
@@ -7222,7 +7430,7 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
*/
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return NULL;
return cpu_rq(cpu);
@@ -7318,6 +7526,27 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *events)
+{
+ struct scx_event_stats *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into @events. */
+ memset(events, 0, sizeof(*events));
+ for_each_possible_cpu(cpu) {
+ e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
+ scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+}
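
A sketch of consuming these counters from the BPF side (not part of the patch): the struct scx_event_stats member names are assumed to match the SCX_EV_* events aggregated above, and bpf_printk is used purely for illustration.

static void example_dump_events(void)
{
	struct scx_event_stats events;

	scx_bpf_events(&events, sizeof(events));
	bpf_printk("select_cpu_fallback=%lld bypass_activate=%lld",
		   events.SCX_EV_SELECT_CPU_FALLBACK,
		   events.SCX_EV_BYPASS_ACTIVATE);
}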
+
/*
  * scx_bpf_events - Copy the system-wide event counters to @events
* @events: output buffer from a BPF program
@@ -7326,23 +7555,16 @@ __bpf_kfunc u64 scx_bpf_now(void)
__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
size_t events__sz)
{
- struct scx_event_stats e_sys, *e_cpu;
- int cpu;
+ struct scx_sched *sch;
+ struct scx_event_stats e_sys;
- /* Aggregate per-CPU event counters into the system-wide counters. */
- memset(&e_sys, 0, sizeof(e_sys));
- for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
- scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
- }
+ rcu_read_lock();
+ sch = rcu_dereference(scx_root);
+ if (sch)
+ scx_read_events(sch, &e_sys);
+ else
+ memset(&e_sys, 0, sizeof(e_sys));
+ rcu_read_unlock();
/*
* We cannot entirely trust a BPF-provided size since a BPF program
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 1bda96b19a1b..a75835c23f15 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,6 +8,11 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
+static inline bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
void scx_tick(struct rq *rq);
@@ -21,6 +26,7 @@ void scx_rq_activate(struct rq *rq);
void scx_rq_deactivate(struct rq *rq);
int scx_check_setscheduler(struct task_struct *p, int policy);
bool task_should_scx(int policy);
+bool scx_allow_ttwu_queue(const struct task_struct *p);
void init_sched_ext_class(void);
static inline u32 scx_cpuperf_target(s32 cpu)
@@ -36,13 +42,6 @@ static inline bool task_on_scx(const struct task_struct *p)
return scx_enabled() && p->sched_class == &ext_sched_class;
}
-static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
-{
- return !scx_enabled() ||
- static_branch_likely(&scx_ops_allow_queued_wakeup) ||
- p->sched_class != &ext_sched_class;
-}
-
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
@@ -80,6 +79,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
#ifdef CONFIG_CGROUP_SCHED
#ifdef CONFIG_EXT_GROUP_SCHED
+void scx_tg_init(struct task_group *tg);
int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
@@ -89,6 +89,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
#else /* CONFIG_EXT_GROUP_SCHED */
+static inline void scx_tg_init(struct task_group *tg) {}
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index e67a19a071c1..001fb88a8481 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -47,6 +47,13 @@ static struct scx_idle_cpus scx_idle_global_masks;
static struct scx_idle_cpus **scx_idle_node_masks;
/*
+ * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
+ */
+static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
+static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
+
+/*
* Return the idle masks associated to a target @node.
*
* NUMA_NO_NODE identifies the global idle cpumask.
@@ -131,6 +138,7 @@ found:
goto retry;
}
+#ifdef CONFIG_NUMA
/*
* Tracks nodes that have not yet been visited when searching for an idle
* CPU across all available nodes.
@@ -179,6 +187,13 @@ static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, i
return cpu;
}
+#else
+static inline s32
+pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ return -EBUSY;
+}
+#endif
/*
* Find an idle CPU in the system, starting from @node.
@@ -392,6 +407,14 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
}
/*
+ * Return true if @p can run on all possible CPUs, false otherwise.
+ */
+static inline bool task_affinity_all(const struct task_struct *p)
+{
+ return p->nr_cpus_allowed >= num_possible_cpus();
+}
+
+/*
* Built-in CPU idle selection policy:
*
* 1. Prioritize full-idle cores:
@@ -403,13 +426,15 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* branch prediction optimizations.
*
* 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
+ * - if the above conditions aren't met, pick a CPU that shares the same
+ * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
+ * cache locality.
*
* 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
+ * - choose a CPU from the same NUMA node, if the node cpumask is a
+ * subset of @cpus_allowed, to reduce memory access latency.
*
- * 5. Pick any idle CPU usable by the task.
+ * 5. Pick any idle CPU within the @cpus_allowed domain.
*
* Step 3 and 4 are performed only if the system has, respectively,
* multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
@@ -424,35 +449,69 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
* NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
* we never call ops.select_cpu() for them, see select_task_rq().
*/
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
{
- const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
+ const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
+ const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr;
int node = scx_cpu_node_if_enabled(prev_cpu);
+ bool is_prev_allowed;
s32 cpu;
+ preempt_disable();
+
+ /*
+ * Check whether @prev_cpu is still within the allowed set. If not,
+ * we can still try selecting a nearby CPU.
+ */
+ is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
+
+ /*
+ * Determine the subset of CPUs usable by @p within @cpus_allowed.
+ */
+ if (allowed != p->cpus_ptr) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask);
+
+ if (task_affinity_all(p)) {
+ allowed = cpus_allowed;
+ } else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) {
+ allowed = local_cpus;
+ } else {
+ cpu = -EBUSY;
+ goto out_enable;
+ }
+ }
+
/*
* This is necessary to protect llc_cpus.
*/
rcu_read_lock();
/*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
+ * Determine the subset of CPUs that the task can use in its
+ * current LLC and node.
*
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
+ * If the task can run on all CPUs, use the node and LLC cpumasks
+ * directly.
*/
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask);
+ const struct cpumask *cpus = numa_span(prev_cpu);
+
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ numa_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ numa_cpus = local_cpus;
+ }
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
- llc_cpus = llc_span(prev_cpu);
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
+ struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask);
+ const struct cpumask *cpus = llc_span(prev_cpu);
+
+ if (allowed == p->cpus_ptr && task_affinity_all(p))
+ llc_cpus = cpus;
+ else if (cpus && cpumask_and(local_cpus, allowed, cpus))
+ llc_cpus = local_cpus;
}
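
The intersection order used above (task affinity narrowed by @cpus_allowed, then the NUMA and LLC spans narrowed by that result) can be reproduced with ordinary userspace CPU sets. A standalone sketch, not kernel code, assuming CPUs 0-7 and one contrived LLC span:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t task_affinity, cpus_allowed, llc_span, allowed, llc;
	int cpu;

	CPU_ZERO(&task_affinity);
	CPU_ZERO(&cpus_allowed);
	CPU_ZERO(&llc_span);
	for (cpu = 0; cpu < 8; cpu++)		/* task may run on CPUs 0-7 */
		CPU_SET(cpu, &task_affinity);
	for (cpu = 4; cpu < 8; cpu++)		/* caller restricts to 4-7 */
		CPU_SET(cpu, &cpus_allowed);
	for (cpu = 2; cpu < 6; cpu++)		/* contrived LLC spanning 2-5 */
		CPU_SET(cpu, &llc_span);

	/* allowed = cpus_allowed & p->cpus_ptr, llc_cpus = allowed & llc_span */
	CPU_AND(&allowed, &cpus_allowed, &task_affinity);
	CPU_AND(&llc, &allowed, &llc_span);

	for (cpu = 0; cpu < 8; cpu++)
		if (CPU_ISSET(cpu, &llc))
			printf("LLC candidate: CPU %d\n", cpu);	/* 4 and 5 */
	return 0;
}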
/*
@@ -466,7 +525,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* then avoid a migration.
*/
cpu = smp_processor_id();
- if (cpus_share_cache(cpu, prev_cpu) &&
+ if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) &&
scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
@@ -490,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
(!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
!cpumask_empty(idle_cpumask(waker_node)->cpu)) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (cpumask_test_cpu(cpu, allowed))
goto out_unlock;
}
}
@@ -503,7 +562,8 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
/*
* Keep using @prev_cpu if it's part of a fully idle core.
*/
- if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
+ if (is_prev_allowed &&
+ cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
@@ -535,7 +595,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* begin in prev_cpu's node and proceed to other nodes in
* order of increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+ cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE);
if (cpu >= 0)
goto out_unlock;
@@ -552,7 +612,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
/*
* Use @prev_cpu if it's idle.
*/
- if (scx_idle_test_and_clear_cpu(prev_cpu)) {
+ if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
}
@@ -583,10 +643,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64
* in prev_cpu's node and proceed to other nodes in order of
* increasing distance.
*/
- cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
+ cpu = scx_pick_idle_cpu(allowed, node, flags);
out_unlock:
rcu_read_unlock();
+out_enable:
+ preempt_enable();
return cpu;
}
@@ -596,7 +658,7 @@ out_unlock:
*/
void scx_idle_init_masks(void)
{
- int node;
+ int i;
/* Allocate global idle cpumasks */
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
@@ -607,13 +669,23 @@ void scx_idle_init_masks(void)
sizeof(*scx_idle_node_masks), GFP_KERNEL);
BUG_ON(!scx_idle_node_masks);
- for_each_node(node) {
- scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
- GFP_KERNEL, node);
- BUG_ON(!scx_idle_node_masks[node]);
+ for_each_node(i) {
+ scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, i);
+ BUG_ON(!scx_idle_node_masks[i]);
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
- BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i));
+ }
+
+ /* Allocate local per-cpu idle cpumasks */
+ for_each_possible_cpu(i) {
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
+ BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
+ GFP_KERNEL, cpu_to_node(i)));
}
}
@@ -662,21 +734,12 @@ static void update_builtin_idle(int cpu, bool idle)
*/
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
+ struct scx_sched *sch = scx_root;
int cpu = cpu_of(rq);
lockdep_assert_rq_held(rq);
/*
- * Trigger ops.update_idle() only when transitioning from a task to
- * the idle thread and vice versa.
- *
- * Idle transitions are indicated by do_notify being set to true,
- * managed by put_prev_task_idle()/set_next_task_idle().
- */
- if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
-
- /*
* Update the idle masks:
* - for real idle transitions (do_notify == true)
* - for idle-to-idle transitions (indicated by the previous task
@@ -693,6 +756,21 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
if (static_branch_likely(&scx_builtin_idle_enabled))
if (do_notify || is_idle_task(rq->curr))
update_builtin_idle(cpu, idle);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ *
+ * This must come after builtin idle update so that BPF schedulers can
+ * create interlocking between ops.update_idle() and ops.enqueue() -
+ * either enqueue() sees the idle bit or update_idle() sees the task
+ * that enqueue() queued.
+ */
+ if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
}
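
A hedged sketch of the interlock described in the comment above, from the BPF side: if ops.enqueue() placed work on a shared DSQ while this CPU was going idle, ops.update_idle() now observes it and can kick the CPU back into the scheduling path. SHARED_DSQ and BPF_STRUCT_OPS are assumptions from the scx tooling; the kfuncs are the ones shown earlier in this series.

void BPF_STRUCT_OPS(example_update_idle, s32 cpu, bool idle)
{
	/* Going idle: if a task was queued concurrently, make sure this
	 * CPU notices it instead of staying idle. */
	if (idle && scx_bpf_dsq_nr_queued(SHARED_DSQ) > 0)
		scx_bpf_kick_cpu(cpu, 0);
}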
static void reset_idle_masks(struct sched_ext_ops *ops)
@@ -748,7 +826,7 @@ void scx_idle_disable(void)
static int validate_node(int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is disabled");
+ scx_kf_error("per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -758,13 +836,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_ops_error("invalid node %d", node);
+ scx_kf_error("invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_ops_error("unavailable node %d", node);
+ scx_kf_error("unavailable node %d", node);
return -EINVAL;
}
@@ -778,10 +856,72 @@ static bool check_builtin_idle_enabled(void)
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_ops_error("built-in idle tracking is disabled");
+ scx_kf_error("built-in idle tracking is disabled");
return false;
}
+s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *allowed, u64 flags)
+{
+ struct rq *rq;
+ struct rq_flags rf;
+ s32 cpu;
+
+ if (!kf_cpu_valid(prev_cpu, NULL))
+ return -EINVAL;
+
+ if (!check_builtin_idle_enabled())
+ return -EBUSY;
+
+ /*
+ * If called from an unlocked context, acquire the task's rq lock,
+ * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed.
+ *
+ * Otherwise, allow to use this kfunc only from ops.select_cpu()
+	 * and ops.enqueue().
+ */
+ if (scx_kf_allowed_if_unlocked()) {
+ rq = task_rq_lock(p, &rf);
+ } else {
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ return -EPERM;
+ rq = scx_locked_rq();
+ }
+
+ /*
+ * Validate locking correctness to access p->cpus_ptr and
+ * p->nr_cpus_allowed: if we're holding an rq lock, we're safe;
+ * otherwise, assert that p->pi_lock is held.
+ */
+ if (!rq)
+ lockdep_assert_held(&p->pi_lock);
+
+#ifdef CONFIG_SMP
+ /*
+ * This may also be called from ops.enqueue(), so we need to handle
+ * per-CPU tasks as well. For these tasks, we can skip all idle CPU
+ * selection optimizations and simply check whether the previously
+ * used CPU is idle and within the allowed cpumask.
+ */
+ if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) {
+ if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
+ scx_idle_test_and_clear_cpu(prev_cpu))
+ cpu = prev_cpu;
+ else
+ cpu = -EBUSY;
+ } else {
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
+ allowed ?: p->cpus_ptr, flags);
+ }
+#else
+ cpu = -EBUSY;
+#endif
+ if (scx_kf_allowed_if_unlocked())
+ task_rq_unlock(rq, p, &rf);
+
+ return cpu;
+}
+
/**
* scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
* trigger an error if @cpu is invalid
@@ -790,7 +930,7 @@ static bool check_builtin_idle_enabled(void)
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
#ifdef CONFIG_NUMA
- if (!ops_cpu_valid(cpu, NULL))
+ if (!kf_cpu_valid(cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
@@ -806,9 +946,10 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
* @wake_flags: %SCX_WAKE_* flags
* @is_idle: out parameter indicating whether the returned CPU is idle
*
- * Can only be called from ops.select_cpu() if the built-in CPU selection is
- * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
*
* Returns the picked CPU with *@is_idle indicating whether the picked CPU is
* currently idle and thus a good candidate for direct dispatching.
@@ -816,39 +957,52 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
-#ifdef CONFIG_SMP
s32 cpu;
-#endif
- if (!ops_cpu_valid(prev_cpu, NULL))
- goto prev_cpu;
-
- if (!check_builtin_idle_enabled())
- goto prev_cpu;
-
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
- goto prev_cpu;
-#ifdef CONFIG_SMP
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
}
-#endif
-
-prev_cpu:
*is_idle = false;
+
return prev_cpu;
}
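
Typical usage from ops.select_cpu(), as a sketch: when *is_idle comes back true, the task is a good candidate for direct dispatch to the picked CPU's local DSQ. scx_bpf_dsq_insert(), SCX_DSQ_LOCAL and SCX_SLICE_DFL are assumed from the sched_ext headers; BPF_STRUCT_OPS is the scx tooling macro.

s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}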
/**
+ * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
+ * prioritizing those in @cpus_allowed
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @cpus_allowed: cpumask of allowed CPUs
+ * @flags: %SCX_PICK_IDLE* flags
+ *
+ * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
+ * context such as a BPF test_run() call, as long as built-in CPU selection
+ * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
+ * is set.
+ *
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the selected idle CPU, which will be automatically awakened upon
+ * returning from ops.select_cpu() and can be used for direct dispatch, or
+ * a negative value if no idle CPU is available.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags)
+{
+ return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags);
+}
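
A sketch of the new kfunc from ops.select_cpu(). Passing p->cpus_ptr simply mirrors the default behaviour above; a real scheduler would normally pass a narrower cpumask (for example one it builds at init time) to express policy. Helper names other than the kfunc itself are assumptions from the scx tooling.

s32 BPF_STRUCT_OPS(example_select_cpu_and, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	s32 cpu;

	cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, p->cpus_ptr, 0);
	if (cpu >= 0) {
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return cpu;
	}

	return prev_cpu;
}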
+
+/**
* scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
* idle-tracking per-CPU cpumask of a target NUMA node.
* @node: target NUMA node
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
@@ -873,7 +1027,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
@@ -895,7 +1049,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*
* Returns an empty cpumask if idle tracking is not enabled, if @node is
* not valid, or running on a UP kernel. In this case the actual error will
- * be reported to the BPF scheduler via scx_ops_error().
+ * be reported to the BPF scheduler via scx_error().
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
@@ -924,7 +1078,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
@@ -971,7 +1125,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
if (!check_builtin_idle_enabled())
return false;
- if (ops_cpu_valid(cpu, NULL))
+ if (kf_cpu_valid(cpu, NULL))
return scx_idle_test_and_clear_cpu(cpu);
else
return false;
@@ -1032,7 +1186,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_kf_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1109,7 +1263,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
s32 cpu;
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_ops_error("per-node idle tracking is enabled");
+ scx_kf_error("per-node idle tracking is enabled");
return -EBUSY;
}
@@ -1140,6 +1294,8 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle)
static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
@@ -1147,21 +1303,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
.set = &scx_kfunc_ids_idle,
};
-BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
-BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
-
-static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
- .owner = THIS_MODULE,
- .set = &scx_kfunc_ids_select_cpu,
-};
-
int scx_idle_init(void)
{
int ret;
- ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
- register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 511cc2221f7a..37be78a7502b 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -27,7 +27,8 @@ static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node
}
#endif /* CONFIG_SMP */
-s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags);
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *cpus_allowed, u64 flags);
void scx_idle_enable(struct sched_ext_ops *ops);
void scx_idle_disable(void);
int scx_idle_init(void);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 125912c0e9dd..b173a059315c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str)
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
-#ifdef CONFIG_SMP
/*
* For asym packing, by default the lower numbered CPU has higher priority.
*/
@@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu)
* (default: ~5%)
*/
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
-#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
@@ -162,7 +160,7 @@ static int __init sched_fair_sysctl_init(void)
return 0;
}
late_initcall(sched_fair_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -471,7 +469,7 @@ static int se_is_idle(struct sched_entity *se)
return cfs_rq_is_idle(group_cfs_rq(se));
}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -517,7 +515,7 @@ static int se_is_idle(struct sched_entity *se)
return task_has_idle_policy(task_of(se));
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
@@ -884,23 +882,44 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
}
/*
- * HACK, stash a copy of deadline at the point of pick in vlag,
- * which isn't used until dequeue.
+ * Set the vruntime up to which an entity can run before looking
+ * for another entity to pick.
+ * In case of run to parity, we use the shortest slice of the enqueued
+ * entities to set the protected period.
+ * When run to parity is disabled, we give a minimum quantum to the running
+ * entity to ensure progress.
*/
-static inline void set_protect_slice(struct sched_entity *se)
+static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 slice = normalized_sysctl_sched_base_slice;
+ u64 vprot = se->deadline;
+
+ if (sched_feat(RUN_TO_PARITY))
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ slice = min(slice, se->slice);
+ if (slice != se->slice)
+ vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se));
+
+ se->vprot = vprot;
+}
+
+static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- se->vlag = se->deadline;
+ u64 slice = cfs_rq_min_slice(cfs_rq);
+
+ se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se));
}
static inline bool protect_slice(struct sched_entity *se)
{
- return se->vlag == se->deadline;
+ return ((s64)(se->vprot - se->vruntime) > 0);
}
static inline void cancel_protect_slice(struct sched_entity *se)
{
if (protect_slice(se))
- se->vlag = se->deadline + 1;
+ se->vprot = se->vruntime;
}
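
The comparison above uses the usual signed-difference idiom so the protected window stays correct even if vruntime wraps around u64. A standalone demonstration of just that arithmetic (ordinary userspace C, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Same wraparound-safe check as protect_slice(): true while vruntime has
 * not yet caught up with vprot, even across u64 overflow. */
static int slice_protected(uint64_t vprot, uint64_t vruntime)
{
	return (int64_t)(vprot - vruntime) > 0;
}

int main(void)
{
	uint64_t vruntime = UINT64_MAX - 100;	/* about to wrap */
	uint64_t vprot = vruntime + 250;	/* wraps past zero */

	printf("%d\n", slice_protected(vprot, vruntime));	/* 1: protected */
	vruntime += 300;			/* ran past the protected window */
	printf("%d\n", slice_protected(vprot, vruntime));	/* 0: expired */
	return 0;
}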
/*
@@ -922,7 +941,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
*
* Which allows tree pruning through eligibility.
*/
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -939,7 +958,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
- if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr))
+ if (curr && protect && protect_slice(curr))
return curr;
/* Pick the leftmost entity if it's eligible */
@@ -983,6 +1002,11 @@ found:
return best;
}
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ return __pick_eevdf(cfs_rq, true);
+}
+
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -996,7 +1020,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
/**************************************************************
* Scheduling class statistics methods:
*/
-#ifdef CONFIG_SMP
int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
@@ -1008,7 +1031,6 @@ int sched_update_scaling(void)
return 0;
}
-#endif
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
@@ -1041,7 +1063,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#include "pelt.h"
-#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
@@ -1131,34 +1152,40 @@ void post_init_entity_util_avg(struct task_struct *p)
sa->runnable_avg = sa->util_avg;
}
-#else /* !CONFIG_SMP */
-void init_entity_runnable_average(struct sched_entity *se)
-{
-}
-void post_init_entity_util_avg(struct task_struct *p)
-{
-}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq)
-{
-}
-#endif /* CONFIG_SMP */
-
-static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
+static s64 update_se(struct rq *rq, struct sched_entity *se)
{
u64 now = rq_clock_task(rq);
s64 delta_exec;
- delta_exec = now - curr->exec_start;
+ delta_exec = now - se->exec_start;
if (unlikely(delta_exec <= 0))
return delta_exec;
- curr->exec_start = now;
- curr->sum_exec_runtime += delta_exec;
+ se->exec_start = now;
+ if (entity_is_task(se)) {
+ struct task_struct *donor = task_of(se);
+ struct task_struct *running = rq->curr;
+ /*
+ * If se is a task, we account the time against the running
+ * task, as w/ proxy-exec they may not be the same.
+ */
+ running->se.exec_start = now;
+ running->se.sum_exec_runtime += delta_exec;
+
+ trace_sched_stat_runtime(running, delta_exec);
+ account_group_exec_runtime(running, delta_exec);
+
+ /* cgroup time is always accounted against the donor */
+ cgroup_account_cputime(donor, delta_exec);
+ } else {
+ /* If not task, account the time against donor se */
+ se->sum_exec_runtime += delta_exec;
+ }
if (schedstat_enabled()) {
struct sched_statistics *stats;
- stats = __schedstats_from_se(curr);
+ stats = __schedstats_from_se(se);
__schedstat_set(stats->exec_max,
max(delta_exec, stats->exec_max));
}
@@ -1166,58 +1193,12 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
return delta_exec;
}
-static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
-{
- trace_sched_stat_runtime(p, delta_exec);
- account_group_exec_runtime(p, delta_exec);
- cgroup_account_cputime(p, delta_exec);
-}
-
-static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- if (!sched_feat(PREEMPT_SHORT))
- return false;
-
- if (curr->vlag == curr->deadline)
- return false;
-
- return !entity_eligible(cfs_rq, curr);
-}
-
-static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
- struct sched_entity *pse, struct sched_entity *se)
-{
- if (!sched_feat(PREEMPT_SHORT))
- return false;
-
- if (pse->slice >= se->slice)
- return false;
-
- if (!entity_eligible(cfs_rq, pse))
- return false;
-
- if (entity_before(pse, se))
- return true;
-
- if (!entity_eligible(cfs_rq, se))
- return true;
-
- return false;
-}
-
/*
* Used by other classes to account runtime.
*/
s64 update_curr_common(struct rq *rq)
{
- struct task_struct *donor = rq->donor;
- s64 delta_exec;
-
- delta_exec = update_curr_se(rq, &donor->se);
- if (likely(delta_exec > 0))
- update_curr_task(donor, delta_exec);
-
- return delta_exec;
+ return update_se(rq, &rq->donor->se);
}
/*
@@ -1225,6 +1206,12 @@ s64 update_curr_common(struct rq *rq)
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
+ /*
+ * Note: cfs_rq->curr corresponds to the task picked to
+ * run (ie: rq->donor.se) which due to proxy-exec may
+ * not necessarily be the actual task running
+ * (rq->curr.se). This is easy to confuse!
+ */
struct sched_entity *curr = cfs_rq->curr;
struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
@@ -1233,7 +1220,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (unlikely(!curr))
return;
- delta_exec = update_curr_se(rq, curr);
+ delta_exec = update_se(rq, curr);
if (unlikely(delta_exec <= 0))
return;
@@ -1242,10 +1229,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
- struct task_struct *p = task_of(curr);
-
- update_curr_task(p, delta_exec);
-
/*
* If the fair_server is active, we need to account for the
* fair_server time whether or not the task is running on
@@ -1265,7 +1248,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (cfs_rq->nr_queued == 1)
return;
- if (resched || did_preempt_short(cfs_rq, curr)) {
+ if (resched || !protect_slice(curr)) {
resched_curr_lazy(rq);
clear_buddies(cfs_rq, curr);
}
@@ -2114,12 +2097,12 @@ static inline int numa_idle_core(int idle_core, int cpu)
return idle_core;
}
-#else
+#else /* !CONFIG_SCHED_SMT: */
static inline int numa_idle_core(int idle_core, int cpu)
{
return idle_core;
}
-#endif
+#endif /* !CONFIG_SCHED_SMT */
/*
* Gather all necessary information to make NUMA balancing placement
@@ -2273,7 +2256,8 @@ static bool task_numa_compare(struct task_numa_env *env,
rcu_read_lock();
cur = rcu_dereference(dst_rq->curr);
- if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
+ if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
+ !cur->mm))
cur = NULL;
/*
@@ -3329,6 +3313,15 @@ static void task_numa_work(struct callback_head *work)
if (p->flags & PF_EXITING)
return;
+ /*
+ * Memory is pinned to only one NUMA node via cpuset.mems, naturally
+ * no page can be migrated.
+ */
+ if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) {
+ trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed);
+ return;
+ }
+
if (!mm->numa_next_scan) {
mm->numa_next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
@@ -3663,7 +3656,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
p->numa_scan_period = task_scan_start(p);
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
+
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
@@ -3680,20 +3674,18 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
if (entity_is_task(se)) {
struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se));
list_add(&se->group_node, &rq->cfs_tasks);
}
-#endif
cfs_rq->nr_queued++;
}
@@ -3701,12 +3693,10 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
}
-#endif
cfs_rq->nr_queued--;
}
@@ -3758,7 +3748,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
-#ifdef CONFIG_SMP
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -3775,12 +3764,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
-#else
-static inline void
-enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-static inline void
-dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-#endif
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
@@ -3812,13 +3795,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
update_load_set(&se->load, weight);
-#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
-#endif
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
@@ -3853,7 +3834,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
/*
* All this does is approximate the hierarchical proportion which includes that
* global sum we all love to hate.
@@ -3960,7 +3940,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
*/
return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
-#endif /* CONFIG_SMP */
/*
* Recomputes the group entity based on the current state of its group
@@ -3981,20 +3960,16 @@ static void update_cfs_group(struct sched_entity *se)
if (throttled_hierarchy(gcfs_rq))
return;
-#ifndef CONFIG_SMP
- shares = READ_ONCE(gcfs_rq->tg->shares);
-#else
shares = calc_group_shares(gcfs_rq);
-#endif
if (unlikely(se->load.weight != shares))
reweight_entity(cfs_rq_of(se), se, shares);
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void update_cfs_group(struct sched_entity *se)
{
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
@@ -4019,7 +3994,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
}
}
-#ifdef CONFIG_SMP
static inline bool load_avg_is_decayed(struct sched_avg *sa)
{
if (sa->load_sum)
@@ -4471,7 +4445,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
return true;
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
@@ -4484,7 +4458,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_NO_HZ_COMMON
static inline void migrate_se_pelt_lag(struct sched_entity *se)
@@ -4565,9 +4539,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
__update_load_avg_blocked_se(now, se);
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static void migrate_se_pelt_lag(struct sched_entity *se) {}
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
@@ -5134,48 +5108,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
-#else /* CONFIG_SMP */
-
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- return !cfs_rq->nr_queued;
-}
-
-#define UPDATE_TG 0x0
-#define SKIP_AGE_LOAD 0x0
-#define DO_ATTACH 0x0
-#define DO_DETACH 0x0
-
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
-{
- cfs_rq_util_change(cfs_rq, 0);
-}
-
-static inline void remove_entity_load_avg(struct sched_entity *se) {}
-
-static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-
-static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
-{
- return 0;
-}
-
-static inline void
-util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
-static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
-
-#endif /* CONFIG_SMP */
-
void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_entity *se = &p->se;
@@ -5243,7 +5175,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
* = (W*V + w_i*(V - vl_i)) / (W + w_i)
* = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
- * = (V*(W + w_i) - w_i*l) / (W + w_i)
+ * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
* = V - w_i*vl_i / (W + w_i)
*
* And the actual lag after adding an entity with vl_i is:
@@ -5540,7 +5472,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- set_protect_slice(se);
+ set_protect_slice(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
@@ -5675,7 +5607,7 @@ void cfs_bandwidth_usage_dec(void)
{
static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
-#else /* CONFIG_JUMP_LABEL */
+#else /* !CONFIG_JUMP_LABEL: */
static bool cfs_bandwidth_used(void)
{
return true;
@@ -5683,16 +5615,7 @@ static bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void) {}
void cfs_bandwidth_usage_dec(void) {}
-#endif /* CONFIG_JUMP_LABEL */
-
-/*
- * default period for cfs group bandwidth.
- * default: 0.1s, units: nanoseconds
- */
-static inline u64 default_cfs_period(void)
-{
- return 100000000ULL;
-}
+#endif /* !CONFIG_JUMP_LABEL */
static inline u64 sched_cfs_bandwidth_slice(void)
{
@@ -5879,7 +5802,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long queued_delta, runnable_delta, idle_delta, dequeue = 1;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5963,10 +5885,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, queued_delta);
-
- /* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
done:
/*
* Note: distribution will already see us throttled via the
@@ -6078,7 +5996,6 @@ unthrottle_throttle:
resched_curr(rq);
}
-#ifdef CONFIG_SMP
static void __cfsb_csd_unthrottle(void *arg)
{
struct cfs_rq *cursor, *tmp;
@@ -6137,12 +6054,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
if (first)
smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
}
-#else
-static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
-{
- unthrottle_cfs_rq(cfs_rq);
-}
-#endif
static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
@@ -6480,8 +6391,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-extern const u64 max_cfs_quota_period;
-
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -6508,7 +6417,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
* to fail.
*/
new = old * 2;
- if (new < max_cfs_quota_period) {
+ if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
cfs_b->burst *= 2;
@@ -6542,7 +6451,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
- cfs_b->period = ns_to_ktime(default_cfs_period());
+ cfs_b->period = us_to_ktime(default_bw_period_us());
cfs_b->burst = 0;
cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
@@ -6598,7 +6507,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* guaranteed at this point that no additional cfs_rq of this group can
* join a CSD list.
*/
-#ifdef CONFIG_SMP
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
unsigned long flags;
@@ -6610,7 +6518,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
__cfsb_csd_unthrottle(rq);
local_irq_restore(flags);
}
-#endif
}
/*
@@ -6723,9 +6630,9 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
if (cfs_task_bw_constrained(p))
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
-#endif
+#endif /* CONFIG_NO_HZ_FULL */
-#else /* CONFIG_CFS_BANDWIDTH */
+#else /* !CONFIG_CFS_BANDWIDTH: */
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
@@ -6767,7 +6674,7 @@ bool cfs_task_bw_constrained(struct task_struct *p)
return false;
}
#endif
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* !CONFIG_CFS_BANDWIDTH */
#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
@@ -6812,7 +6719,7 @@ static void hrtick_update(struct rq *rq)
hrtick_start_fair(rq, donor);
}
-#else /* !CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
@@ -6821,9 +6728,8 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
static inline void hrtick_update(struct rq *rq)
{
}
-#endif
+#endif /* !CONFIG_SCHED_HRTICK */
-#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
unsigned long rq_util_min, rq_util_max;
@@ -6865,9 +6771,6 @@ static inline void check_update_overutilized_status(struct rq *rq)
if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
set_rd_overutilized(rq->rd, 1);
}
-#else
-static inline void check_update_overutilized_status(struct rq *rq) { }
-#endif
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
@@ -6876,12 +6779,10 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
return sched_idle_rq(cpu_rq(cpu));
}
-#endif
static void
requeue_delayed_entity(struct sched_entity *se)
@@ -7060,7 +6961,6 @@ static void set_next_buddy(struct sched_entity *se);
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
bool was_sched_idle = sched_idle_rq(rq);
- int rq_h_nr_queued = rq->cfs.h_nr_queued;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
@@ -7144,9 +7044,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
sub_nr_running(rq, h_nr_queued);
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
-
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
@@ -7196,8 +7093,6 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
}
-#ifdef CONFIG_SMP
-
/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
@@ -7667,7 +7562,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
return -1;
}
-#else /* CONFIG_SCHED_SMT */
+#else /* !CONFIG_SCHED_SMT: */
static inline void set_idle_cores(int cpu, int val)
{
@@ -7688,7 +7583,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
return -1;
}
-#endif /* CONFIG_SCHED_SMT */
+#endif /* !CONFIG_SCHED_SMT */
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
@@ -8733,9 +8628,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return sched_balance_newidle(rq, rf) != 0;
}
-#else
-static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
-#endif /* CONFIG_SMP */
static void set_next_buddy(struct sched_entity *se)
{
@@ -8757,6 +8649,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct sched_entity *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
+ bool do_preempt_short = false;
if (unlikely(se == pse))
return;
@@ -8805,7 +8698,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* When non-idle entity preempt an idle entity,
* don't give idle entity slice protection.
*/
- cancel_protect_slice(se);
+ do_preempt_short = true;
goto preempt;
}
@@ -8823,22 +8716,24 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
/*
* If @p has a shorter slice than current and @p is eligible, override
* current's slice protection in order to allow preemption.
- *
- * Note that even if @p does not turn out to be the most eligible
- * task at this moment, current's slice protection will be lost.
*/
- if (do_preempt_short(cfs_rq, pse, se))
- cancel_protect_slice(se);
+ do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
/*
* If @p has become the most eligible task, force preemption.
*/
- if (pick_eevdf(cfs_rq) == pse)
+ if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
goto preempt;
+ if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
+ update_protect_slice(cfs_rq, se);
+
return;
preempt:
+ if (do_preempt_short)
+ cancel_protect_slice(se);
+
resched_curr_lazy(rq);
}
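
A condensed model of the reworked tail of check_preempt_wakeup_fair(): the PREEMPT_SHORT decision is computed up front, the pick is told whether to honour slice protection, and protection is only updated or cancelled afterwards. The pick result is passed in as a plain flag instead of being recomputed, the feature flags are ordinary constants, and the earlier idle-entity path is omitted; all of that is purely for illustration.

#include <stdbool.h>
#include <stdio.h>

struct entity {
        unsigned long long slice;
        bool protected_slice;
};

static const bool feat_preempt_short = true;    /* sched_feat(PREEMPT_SHORT) */
static const bool feat_run_to_parity = true;    /* sched_feat(RUN_TO_PARITY) */

/* @wakee_picked stands in for __pick_eevdf(cfs_rq, !do_preempt_short) == pse. */
static bool should_resched(struct entity *curr, const struct entity *wakee,
                           bool wakee_picked)
{
        bool do_preempt_short = feat_preempt_short && wakee->slice < curr->slice;

        if (wakee_picked)
                goto preempt;

        if (feat_run_to_parity && do_preempt_short)
                curr->protected_slice = true;   /* update_protect_slice() */
        return false;

preempt:
        if (do_preempt_short)
                curr->protected_slice = false;  /* cancel_protect_slice() */
        return true;
}

int main(void)
{
        struct entity curr  = { .slice = 3000000ULL, .protected_slice = true };
        struct entity wakee = { .slice = 1000000ULL };
        bool r;

        r = should_resched(&curr, &wakee, false);
        printf("not picked: resched=%d protected=%d\n", r, curr.protected_slice);

        r = should_resched(&curr, &wakee, true);
        printf("picked:     resched=%d protected=%d\n", r, curr.protected_slice);
        return 0;
}
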
@@ -8929,7 +8824,7 @@ again:
return p;
simple:
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
put_prev_set_next_task(rq, prev, p);
return p;
@@ -9045,7 +8940,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
return true;
}
-#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods.
*
@@ -9347,13 +9241,13 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
return src_weight - dst_weight;
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
static inline long migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
return 0;
}
-#endif
+#endif /* !CONFIG_NUMA_BALANCING */
/*
* Check whether the task is ineligible on the destination cpu
@@ -9397,7 +9291,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) throttled_lb_pair, or
* 3) cannot be migrated to this CPU due to cpus_ptr, or
* 4) running (obviously), or
- * 5) are cache-hot on their current CPU.
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
@@ -9419,6 +9314,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (kthread_is_per_cpu(p))
return 0;
+ if (task_is_blocked(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
@@ -9454,7 +9352,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_on_cpu(env->src_rq, p)) {
+ if (task_on_cpu(env->src_rq, p) ||
+ task_current_donor(env->src_rq, p)) {
schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
@@ -9498,6 +9397,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
schedstat_inc(p->stats.nr_forced_migrations);
}
+ WARN_ON(task_current(env->src_rq, p));
+ WARN_ON(task_current_donor(env->src_rq, p));
+
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
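
A small model of the new migration filter: with proxy execution, a task blocked on a mutex stays queued so its scheduling context can be donated, and neither it nor the runqueue's current donor should be detached by the load balancer. The struct fields below stand in for task_is_blocked(), task_on_cpu() and task_current_donor(); everything here is a sketch, not the kernel helpers themselves.

#include <stdbool.h>
#include <stdio.h>

struct task {
        bool on_cpu;            /* task_on_cpu() */
        bool blocked_on_mutex;  /* task_is_blocked() */
        bool is_rq_donor;       /* task_current_donor() */
};

/* Mirrors the new checks in can_migrate_task(): besides the task that is
 * physically running, skip mutex-blocked tasks and the runqueue's donor. */
static bool can_migrate(const struct task *p)
{
        if (p->blocked_on_mutex)
                return false;
        if (p->on_cpu || p->is_rq_donor)
                return false;
        return true;
}

int main(void)
{
        struct task blocked  = { .blocked_on_mutex = true };
        struct task donor    = { .is_rq_donor = true };
        struct task runnable = { 0 };

        printf("%d %d %d\n", can_migrate(&blocked), can_migrate(&donor),
               can_migrate(&runnable));
        return 0;
}
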
@@ -9762,12 +9664,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
if (!has_blocked)
rq->has_blocked_load = 0;
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
static bool __update_blocked_others(struct rq *rq, bool *done)
{
@@ -9876,7 +9778,7 @@ static unsigned long task_h_load(struct task_struct *p)
return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
cfs_rq_load_avg(cfs_rq) + 1);
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -9893,7 +9795,7 @@ static unsigned long task_h_load(struct task_struct *p)
{
return p->se.avg.load_avg;
}
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static void sched_balance_update_blocked_averages(int cpu)
{
@@ -10038,9 +9940,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
min_capacity = ULONG_MAX;
max_capacity = 0;
- if (child->flags & SD_OVERLAP) {
+ if (child->flags & SD_NUMA) {
/*
- * SD_OVERLAP domains cannot assume that child groups
+ * SD_NUMA domains cannot assume that child groups
* span the current group.
*/
@@ -10053,7 +9955,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
} else {
/*
- * !SD_OVERLAP domains can assume that child groups
+ * !SD_NUMA domains can assume that child groups
* span the current group.
*/
@@ -10606,7 +10508,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
return remote;
return all;
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
return all;
@@ -10616,7 +10518,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
return regular;
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING */
struct sg_lb_stats;
@@ -12164,8 +12066,14 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
/*
* Track max cost of a domain to make sure to not delay the
* next wakeup on the CPU.
+ *
+ * sched_balance_newidle() bumps the cost whenever newidle
+ * balance fails, and we don't want things to grow out of
+ * control. Use the sysctl_sched_migration_cost as the upper
+	 * limit, plus a little extra to avoid off by ones.
*/
- sd->max_newidle_lb_cost = cost;
+ sd->max_newidle_lb_cost =
+ min(cost, sysctl_sched_migration_cost + 200);
sd->last_decay_max_lb_cost = jiffies;
} else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
/*
@@ -12762,7 +12670,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
-#else /* !CONFIG_NO_HZ_COMMON */
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline void nohz_balancer_kick(struct rq *rq) { }
static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
@@ -12771,7 +12679,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle
}
static inline void nohz_newidle_balance(struct rq *this_rq) { }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* sched_balance_newidle is called by schedule() if this_cpu is about to become
@@ -12857,10 +12765,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- update_newidle_cost(sd, domain_cost);
-
curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Failing newidle means it is not effective;
+ * bump the cost so we end up doing less of it.
+ */
+ if (!pulled_task)
+ domain_cost = (3 * sd->max_newidle_lb_cost) / 2;
+
+ update_newidle_cost(sd, domain_cost);
}
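
Combined with the update_newidle_cost() hunk earlier, the intended dynamic is: every failed newidle pull inflates the recorded domain cost by 50%, and the recorded cost is clamped at sysctl_sched_migration_cost plus a small slack, so repeated failures back newidle balancing off without letting the cost grow unbounded. A standalone model follows; the 0.5 ms value is only an example default, and the periodic decay branch is omitted.

#include <stdint.h>
#include <stdio.h>

static const uint64_t sysctl_sched_migration_cost = 500000ULL; /* 0.5 ms, example */

static uint64_t max_newidle_lb_cost = 100000ULL;

static void update_newidle_cost(uint64_t cost)
{
        uint64_t cap = sysctl_sched_migration_cost + 200;

        if (cost > max_newidle_lb_cost)
                max_newidle_lb_cost = cost < cap ? cost : cap;
        /* (the periodic decay branch is left out of this sketch) */
}

static void newidle_attempt(uint64_t measured_cost, int pulled_task)
{
        uint64_t domain_cost = measured_cost;

        /* Failing newidle means it was not effective; bump the cost. */
        if (!pulled_task)
                domain_cost = (3 * max_newidle_lb_cost) / 2;

        update_newidle_cost(domain_cost);
}

int main(void)
{
        for (int i = 0; i < 10; i++)
                newidle_attempt(100000ULL, 0);
        printf("cost after repeated failures: %llu ns\n",
               (unsigned long long)max_newidle_lb_cost);
        return 0;
}
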
/*
@@ -12968,8 +12883,6 @@ static void rq_offline_fair(struct rq *rq)
clear_tg_offline_cfs_rqs(rq);
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_SCHED_CORE
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
@@ -13066,10 +12979,10 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
cfs_rqa = sea->cfs_rq;
cfs_rqb = seb->cfs_rq;
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
cfs_rqa = &task_rq(a)->cfs;
cfs_rqb = &task_rq(b)->cfs;
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
@@ -13093,9 +13006,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu)
#endif
return throttled_hierarchy(cfs_rq);
}
-#else
+#else /* !CONFIG_SCHED_CORE: */
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
-#endif
+#endif /* !CONFIG_SCHED_CORE */
/*
* scheduler tick hitting a task of our scheduling class.
@@ -13189,15 +13102,14 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
list_add_leaf_cfs_rq(cfs_rq);
}
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_SMP
/*
* In case the task sched_avg hasn't been attached:
* - A forked task which hasn't been woken up by wake_up_new_task().
@@ -13206,7 +13118,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
*/
if (!se->avg.last_update_time)
return;
-#endif
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
@@ -13270,7 +13181,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
{
struct sched_entity *se = &p->se;
-#ifdef CONFIG_SMP
if (task_on_rq_queued(p)) {
/*
* Move the next running task to the front of the list, so our
@@ -13278,7 +13188,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
*/
list_move(&se->group_node, &rq->cfs_tasks);
}
-#endif
if (!first)
return;
@@ -13316,9 +13225,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
-#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13333,10 +13240,8 @@ static void task_change_group_fair(struct task_struct *p)
detach_task_cfs_rq(p);
-#ifdef CONFIG_SMP
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
-#endif
set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
@@ -13632,7 +13537,6 @@ DEFINE_SCHED_CLASS(fair) = {
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
-#ifdef CONFIG_SMP
.balance = balance_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -13642,7 +13546,6 @@ DEFINE_SCHED_CLASS(fair) = {
.task_dead = task_dead_fair,
.set_cpus_allowed = set_cpus_allowed_fair,
-#endif
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
@@ -13705,7 +13608,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void)
{
-#ifdef CONFIG_SMP
int i;
for_each_possible_cpu(i) {
@@ -13727,6 +13629,4 @@ __init void init_sched_fair_class(void)
nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
-#endif /* SMP */
-
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2c85c86b455f..c39b089d4f09 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,6 +6,11 @@
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
+#include <linux/cpuidle.h>
+#include <linux/suspend.h>
+#include <linux/livepatch.h>
+#include "sched.h"
+#include "smp.h"
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -47,7 +52,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
-#endif
+#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */
static noinline int __cpuidle cpu_idle_poll(void)
{
@@ -95,10 +100,10 @@ static inline void cond_tick_broadcast_exit(void)
if (static_branch_unlikely(&arch_needs_tick_broadcast))
tick_broadcast_exit();
}
-#else
+#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */
static inline void cond_tick_broadcast_enter(void) { }
static inline void cond_tick_broadcast_exit(void) { }
-#endif
+#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */
/**
* default_idle_call - Default CPU idle routine.
@@ -427,7 +432,6 @@ void cpu_startup_entry(enum cpuhp_state state)
* idle-task scheduling class.
*/
-#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int flags)
{
@@ -439,7 +443,6 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
return WARN_ON_ONCE(1);
}
-#endif
/*
* Idle tasks are unconditionally rescheduled:
@@ -526,11 +529,9 @@ DEFINE_SCHED_CLASS(idle) = {
.put_prev_task = put_prev_task_idle,
.set_next_task = set_next_task_idle,
-#ifdef CONFIG_SMP
.balance = balance_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
-#endif
.task_tick = task_tick_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 93b038d48900..a4cf17b1fab0 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -7,6 +7,8 @@
* Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
*
*/
+#include <linux/sched/isolation.h>
+#include "sched.h"
enum hk_flags {
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index c48900b856a2..b601e0243d0e 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,6 +6,8 @@
* figure. Its a silly number but people think its important. We go through
* great pains to make it work on big machines and tickless kernels.
*/
+#include <linux/sched/nohz.h>
+#include "sched.h"
/*
* Global load-average calculations
@@ -80,7 +82,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
long nr_active, delta = 0;
nr_active = this_rq->nr_running - adjust;
- nr_active += (int)this_rq->nr_uninterruptible;
+ nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
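
The wider cast goes hand in hand with the nr_uninterruptible field change later in this series (struct rq now stores it as unsigned long): a single runqueue's count is only meaningful summed across CPUs and can drift far past 32 bits in either direction, and an (int) cast misreads such values. A small demonstration, assuming an LP64 target and the usual truncating conversion:

#include <stdio.h>

int main(void)
{
        /*
         * One CPU's nr_uninterruptible only makes sense summed over all CPUs,
         * so it may drift far negative on its own; once the magnitude passes
         * 2^31, an (int) cast flips the sign (assumes 64-bit long).
         */
        unsigned long nr_uninterruptible = -2147483653UL; /* drift of -(2^31 + 5) */
        long nr_active = 10;

        printf("old (int) cast:  %ld\n", nr_active + (int)nr_uninterruptible);
        printf("new (long) cast: %ld\n", nr_active + (long)nr_uninterruptible);
        return 0;
}
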
@@ -333,12 +335,12 @@ static void calc_global_nohz(void)
smp_wmb();
calc_load_idx++;
}
-#else /* !CONFIG_NO_HZ_COMMON */
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 809194cd779f..62fba83b7bb1 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -4,6 +4,8 @@
*
* membarrier system call
*/
+#include <uapi/linux/membarrier.h>
+#include "sched.h"
/*
* For documentation purposes, here are some membarrier ordering
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 7a8534a2deff..fa83bbaf4f3e 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -23,6 +23,7 @@
* Move PELT related code from fair.c into this pelt.c file
* Author: Vincent Guittot <vincent.guittot@linaro.org>
*/
+#include "pelt.h"
/*
* Approximate:
@@ -413,7 +414,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
return 0;
}
-#endif
+#endif /* CONFIG_SCHED_HW_PRESSURE */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
@@ -466,7 +467,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
return ret;
}
-#endif
+#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */
/*
* Load avg and utiliztion metrics need to be updated periodically and before
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index f4f6a0875c66..62c3fa543c0f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,4 +1,8 @@
-#ifdef CONFIG_SMP
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _KERNEL_SCHED_PELT_H
+#define _KERNEL_SCHED_PELT_H
+#include "sched.h"
+
#include "sched-pelt.h"
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
@@ -15,7 +19,7 @@ static inline u64 hw_load_avg(struct rq *rq)
{
return READ_ONCE(rq->avg_hw.load_avg);
}
-#else
+#else /* !CONFIG_SCHED_HW_PRESSURE: */
static inline int
update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
@@ -26,7 +30,7 @@ static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
-#endif
+#endif /* !CONFIG_SCHED_HW_PRESSURE */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
@@ -174,63 +178,12 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
-#else
+#else /* !CONFIG_CFS_BANDWIDTH: */
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
}
-#endif
-
-#else
-
-static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
-{
- return 0;
-}
-
-static inline int
-update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
-{
- return 0;
-}
-
-static inline int
-update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
-{
- return 0;
-}
-
-static inline int
-update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
-{
- return 0;
-}
-
-static inline u64 hw_load_avg(struct rq *rq)
-{
- return 0;
-}
-
-static inline int
-update_irq_load_avg(struct rq *rq, u64 running)
-{
- return 0;
-}
-
-static inline u64 rq_clock_pelt(struct rq *rq)
-{
- return rq_clock_task(rq);
-}
-
-static inline void
-update_rq_clock_pelt(struct rq *rq, s64 delta) { }
-
-static inline void
-update_idle_rq_clock_pelt(struct rq *rq) { }
-
-static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
-#endif
-
+#endif /* !CONFIG_CFS_BANDWIDTH */
+#endif /* _KERNEL_SCHED_PELT_H */
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1396674fa722..2024c1d36402 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -136,6 +136,10 @@
* cost-wise, yet way more sensitive and accurate than periodic
* sampling of the aggregate task states would be.
*/
+#include <linux/sched/clock.h>
+#include <linux/workqueue.h>
+#include <linux/psi.h>
+#include "sched.h"
static int psi_bug __read_mostly;
@@ -172,6 +176,28 @@ struct psi_group psi_system = {
.pcpu = &system_group_pcpu,
};
+static DEFINE_PER_CPU(seqcount_t, psi_seq);
+
+static inline void psi_write_begin(int cpu)
+{
+ write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline void psi_write_end(int cpu)
+{
+ write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline u32 psi_read_begin(int cpu)
+{
+ return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline bool psi_read_retry(int cpu, u32 seq)
+{
+ return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq);
+}
+
static void psi_avgs_work(struct work_struct *work);
static void poll_timer_fn(struct timer_list *t);
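
The four helpers above replace the per-group, per-CPU seqcount with one sequence per CPU that covers every group on that CPU; readers keep the usual begin/retry loop. Below is a toy single-threaded model of that read pattern only; the real seqcount_t adds memory barriers and lockdep tracking that this sketch ignores.

#include <stdio.h>

/* Toy per-CPU sequence: odd while a writer is inside its critical section. */
static unsigned int psi_seq[4];

static void psi_write_begin(int cpu) { psi_seq[cpu]++; }        /* now odd  */
static void psi_write_end(int cpu)   { psi_seq[cpu]++; }        /* now even */
static unsigned int psi_read_begin(int cpu) { return psi_seq[cpu]; }
static int psi_read_retry(int cpu, unsigned int seq)
{
        return (seq & 1) || psi_seq[cpu] != seq;
}

int main(void)
{
        unsigned long shared_state = 42, snapshot;
        unsigned int seq;
        int cpu = 0;

        psi_write_begin(cpu);
        shared_state = 43;              /* writer updates under the sequence */
        psi_write_end(cpu);

        do {
                seq = psi_read_begin(cpu);
                snapshot = shared_state;        /* copy out a coherent view */
        } while (psi_read_retry(cpu, seq));

        printf("snapshot=%lu\n", snapshot);
        return 0;
}
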
@@ -182,7 +208,7 @@ static void group_init(struct psi_group *group)
group->enabled = true;
for_each_possible_cpu(cpu)
- seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
+ seqcount_init(per_cpu_ptr(&psi_seq, cpu));
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
mutex_init(&group->avgs_lock);
@@ -262,14 +288,14 @@ static void get_recent_times(struct psi_group *group, int cpu,
/* Snapshot a coherent view of the CPU state */
do {
- seq = read_seqcount_begin(&groupc->seq);
+ seq = psi_read_begin(cpu);
now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
if (cpu == current_cpu)
memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
- } while (read_seqcount_retry(&groupc->seq, seq));
+ } while (psi_read_retry(cpu, seq));
/* Calculate state time deltas against the previous snapshot */
for (s = 0; s < NR_PSI_STATES; s++) {
@@ -733,7 +759,7 @@ static int psi_rtpoll_worker(void *data)
static void poll_timer_fn(struct timer_list *t)
{
- struct psi_group *group = from_timer(group, t, rtpoll_timer);
+ struct psi_group *group = timer_container_of(group, t, rtpoll_timer);
atomic_set(&group->rtpoll_wakeup, 1);
wake_up_interruptible(&group->rtpoll_wait);
@@ -768,31 +794,21 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
groupc->times[PSI_NONIDLE] += delta;
}
+#define for_each_group(iter, group) \
+ for (typeof(group) iter = group; iter; iter = iter->parent)
+
static void psi_group_change(struct psi_group *group, int cpu,
unsigned int clear, unsigned int set,
- bool wake_clock)
+ u64 now, bool wake_clock)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
u32 state_mask;
- u64 now;
lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we update the task counts according to the state
- * change requested through the @clear and @set bits.
- *
- * Then if the cgroup PSI stats accounting enabled, we
- * assess the aggregate resource states this CPU's tasks
- * have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- */
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
- /*
* Start with TSK_ONCPU, which doesn't have a corresponding
* task count - it's just a boolean flag directly encoded in
* the state mask. Clear, set, or carry the current state if
@@ -843,7 +859,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
return;
}
@@ -864,8 +879,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
-
if (state_mask & group->rtpoll_states)
psi_schedule_rtpoll_work(group, 1, false);
@@ -900,24 +913,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set)
void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
- struct psi_group *group;
+ u64 now;
if (!task->pid)
return;
psi_flags_change(task, clear, set);
- group = task_psi_group(task);
- do {
- psi_group_change(group, cpu, clear, set, true);
- } while ((group = group->parent));
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(task))
+ psi_group_change(group, cpu, clear, set, now, true);
+ psi_write_end(cpu);
}
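
for_each_group() simply walks a group and its ancestors, replacing the open-coded do/while over group->parent; psi_task_change() above then takes the per-CPU write section and the timestamp once around the whole walk. A sketch of the macro over a toy hierarchy (GNU C typeof, as in the kernel; the group names are made up):

#include <stdio.h>

struct psi_group {
        const char *name;
        struct psi_group *parent;
};

#define for_each_group(iter, group) \
        for (typeof(group) iter = (group); iter; iter = iter->parent)

int main(void)
{
        struct psi_group root = { "root", NULL };
        struct psi_group mid  = { "mid",  &root };
        struct psi_group leaf = { "leaf", &mid };

        /* Equivalent of the old: do { ... } while ((group = group->parent)); */
        for_each_group(g, &leaf)
                printf("account on %s\n", g->name);
        return 0;
}
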
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep)
{
- struct psi_group *group, *common = NULL;
+ struct psi_group *common = NULL;
int cpu = task_cpu(prev);
+ u64 now;
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
if (next->pid) {
psi_flags_change(next, 0, TSK_ONCPU);
@@ -926,16 +944,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
* ancestors with @prev, those will already have @prev's
* TSK_ONCPU bit set, and we can stop the iteration there.
*/
- group = task_psi_group(next);
- do {
- if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
- PSI_ONCPU) {
+ for_each_group(group, task_psi_group(next)) {
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ if (groupc->state_mask & PSI_ONCPU) {
common = group;
break;
}
-
- psi_group_change(group, cpu, 0, TSK_ONCPU, true);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+ }
}
if (prev->pid) {
@@ -968,12 +985,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
psi_flags_change(prev, clear, set);
- group = task_psi_group(prev);
- do {
+ for_each_group(group, task_psi_group(prev)) {
if (group == common)
break;
- psi_group_change(group, cpu, clear, set, wake_clock);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ }
/*
* TSK_ONCPU is handled up to the common ancestor. If there are
@@ -983,20 +999,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
*/
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
- for (; group; group = group->parent)
- psi_group_change(group, cpu, clear, set, wake_clock);
+ for_each_group(group, common)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
+ psi_write_end(cpu);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
int cpu = task_cpu(curr);
- struct psi_group *group;
struct psi_group_cpu *groupc;
s64 delta;
u64 irq;
+ u64 now;
if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
return;
@@ -1005,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
return;
lockdep_assert_rq_held(rq);
- group = task_psi_group(curr);
- if (prev && task_psi_group(prev) == group)
+ if (prev && task_psi_group(prev) == task_psi_group(curr))
return;
irq = irq_time_read(cpu);
@@ -1015,27 +1031,24 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
return;
rq->psi_irq_time = irq;
- do {
- u64 now;
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(curr)) {
if (!group->enabled)
continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
record_times(groupc, now);
groupc->times[PSI_IRQ_FULL] += delta;
- write_seqcount_end(&groupc->seq);
-
if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
psi_schedule_rtpoll_work(group, 1, false);
- } while ((group = group->parent));
+ }
+ psi_write_end(cpu);
}
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/**
* psi_memstall_enter - mark the beginning of a memory stall section
@@ -1221,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group *group)
return;
for_each_possible_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
+ u64 now;
+
+ guard(rq_lock_irq)(cpu_rq(cpu));
- rq_lock_irq(rq, &rf);
- psi_group_change(group, cpu, 0, 0, true);
- rq_unlock_irq(rq, &rf);
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ psi_write_end(cpu);
}
}
#endif /* CONFIG_CGROUPS */
@@ -1651,7 +1666,7 @@ static const struct proc_ops psi_irq_proc_ops = {
.proc_poll = psi_fop_poll,
.proc_release = psi_fop_release,
};
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
static int __init psi_proc_init(void)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e40422c37033..7936d4333731 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -4,6 +4,9 @@
* policies)
*/
+#include "sched.h"
+#include "pelt.h"
+
int sched_rr_timeslice = RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
@@ -60,7 +63,7 @@ static int __init sched_rt_sysctl_init(void)
return 0;
}
late_initcall(sched_rt_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
void init_rt_rq(struct rt_rq *rt_rq)
{
@@ -75,12 +78,10 @@ void init_rt_rq(struct rt_rq *rt_rq)
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
-#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -291,7 +292,7 @@ err:
return 0;
}
-#else /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
#define rt_entity_is_task(rt_se) (1)
@@ -327,9 +328,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
return 1;
}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_SMP
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
@@ -430,21 +429,6 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
}
}
-#else
-
-static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void rt_queue_push_tasks(struct rq *rq)
-{
-}
-#endif /* CONFIG_SMP */
-
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
@@ -485,12 +469,12 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
return cpu_cap >= min(min_cap, max_cap);
}
-#else
+#else /* !CONFIG_UCLAMP_TASK: */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
return true;
}
-#endif
+#endif /* !CONFIG_UCLAMP_TASK */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -594,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
return p->prio != p->normal_prio;
}
-#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
return this_rq()->rd->span;
}
-#else
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-#endif
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
@@ -625,7 +602,6 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
rt_rq->rt_time < rt_b->rt_runtime);
}
-#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
*/
@@ -798,9 +774,6 @@ static void balance_runtime(struct rt_rq *rt_rq)
raw_spin_lock(&rt_rq->rt_runtime_lock);
}
}
-#else /* !CONFIG_SMP */
-static inline void balance_runtime(struct rt_rq *rt_rq) {}
-#endif /* CONFIG_SMP */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
@@ -930,7 +903,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
typedef struct rt_rq *rt_rq_iter_t;
@@ -977,12 +950,10 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
return &cpu_rq(cpu)->rt;
}
-#ifdef CONFIG_SMP
static void __enable_runtime(struct rq *rq) { }
static void __disable_runtime(struct rq *rq) { }
-#endif
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
@@ -1033,7 +1004,7 @@ static void update_curr_rt(struct rq *rq)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
static void
@@ -1075,8 +1046,6 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
cpufreq_update_util(rq, 0);
}
-#if defined CONFIG_SMP
-
static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
@@ -1107,16 +1076,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
-#else /* CONFIG_SMP */
-
-static inline
-void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-static inline
-void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-
-#endif /* CONFIG_SMP */
-
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
@@ -1155,13 +1114,6 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
-#else
-
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
-
-#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
-
#ifdef CONFIG_RT_GROUP_SCHED
static void
@@ -1182,7 +1134,7 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}
-#else /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
@@ -1192,7 +1144,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
@@ -1488,6 +1440,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags);
+ if (task_is_blocked(p))
+ return;
+
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1538,7 +1493,6 @@ static void yield_task_rt(struct rq *rq)
requeue_task_rt(rq, rq->curr, 0);
}
-#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
static int
@@ -1653,7 +1607,6 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
-#endif /* CONFIG_SMP */
/*
* Preempt the current task with a newly woken task if needed:
@@ -1667,7 +1620,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
return;
}
-#ifdef CONFIG_SMP
/*
* If:
*
@@ -1682,7 +1634,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
*/
if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
check_preempt_equal_prio(rq, p);
-#endif
}
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
@@ -1768,6 +1719,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
+ if (task_is_blocked(p))
+ return;
/*
* The previous task needs to be made eligible for pushing
* if it is still active
@@ -1776,8 +1729,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s
enqueue_pushable_task(rq, p);
}
-#ifdef CONFIG_SMP
-
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
@@ -2451,7 +2402,6 @@ void __init init_sched_rt_class(void)
GFP_KERNEL, cpu_to_node(i));
}
}
-#endif /* CONFIG_SMP */
/*
* When switching a task to RT, we may overload the runqueue
@@ -2475,10 +2425,8 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* then see if we can move to another run queue.
*/
if (task_on_rq_queued(p)) {
-#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
-#endif /* CONFIG_SMP */
if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
@@ -2495,7 +2443,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
return;
if (task_current_donor(rq, p)) {
-#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
* may need to pull tasks to this runqueue.
@@ -2509,11 +2456,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (p->prio > rq->rt.highest_prio.curr)
resched_curr(rq);
-#else
- /* For UP simply resched on drop of prio */
- if (oldprio < p->prio)
- resched_curr(rq);
-#endif /* CONFIG_SMP */
} else {
/*
* This task is not running, but if it is
@@ -2549,9 +2491,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
}
}
}
-#else
+#else /* !CONFIG_POSIX_TIMERS: */
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
-#endif
+#endif /* !CONFIG_POSIX_TIMERS */
/*
* scheduler tick hitting a task of our scheduling class.
@@ -2620,7 +2562,7 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
return rt_rq_throttled(rt_rq);
}
-#endif
+#endif /* CONFIG_SCHED_CORE */
DEFINE_SCHED_CLASS(rt) = {
@@ -2634,7 +2576,6 @@ DEFINE_SCHED_CLASS(rt) = {
.put_prev_task = put_prev_task_rt,
.set_next_task = set_next_task_rt,
-#ifdef CONFIG_SMP
.balance = balance_rt,
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
@@ -2643,7 +2584,6 @@ DEFINE_SCHED_CLASS(rt) = {
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
.find_lock_rq = find_lock_lowest_rq,
-#endif
.task_tick = task_tick_rt,
@@ -2887,7 +2827,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
return 1;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
@@ -2895,7 +2835,7 @@ static int sched_rt_global_constraints(void)
return 0;
}
#endif /* CONFIG_SYSCTL */
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
@@ -2951,6 +2891,12 @@ undo:
sched_domains_mutex_unlock();
mutex_unlock(&mutex);
+ /*
+ * After changing maximum available bandwidth for DEADLINE, we need to
+ * recompute per root domain and per cpus variables accordingly.
+ */
+ rebuild_sched_domains();
+
return ret;
}
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index c529706bed11..6803cfec7a1e 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+#include <linux/types.h>
static const u32 runnable_avg_yN_inv[] __maybe_unused = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c5a6a503eb6d..d3f33d10c58c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -69,6 +69,7 @@
#include <linux/wait_bit.h>
#include <linux/workqueue_api.h>
#include <linux/delayacct.h>
+#include <linux/mmu_context.h>
#include <trace/events/power.h>
#include <trace/events/sched.h>
@@ -384,6 +385,7 @@ extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task);
+extern void sched_init_dl_servers(void);
extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p);
@@ -401,6 +403,19 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
extern struct list_head task_groups;
+#ifdef CONFIG_CFS_BANDWIDTH
+extern const u64 max_bw_quota_period_us;
+
+/*
+ * default period for group bandwidth.
+ * default: 0.1s, units: microseconds
+ */
+static inline u64 default_bw_period_us(void)
+{
+ return 100000ULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+
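
default_bw_period_us() takes over from the old nanosecond-based default_cfs_period(): 100000 microseconds is the same 0.1 s default, now converted by callers such as init_cfs_bandwidth() with us_to_ktime(). A one-line arithmetic check as a standalone sketch:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static inline uint64_t default_bw_period_us(void)
{
        return 100000ULL;       /* 0.1 s, in microseconds */
}

int main(void)
{
        uint64_t ns = default_bw_period_us() * 1000ULL; /* the us -> ns scale */
        printf("%" PRIu64 " us = %" PRIu64 " ns\n", default_bw_period_us(), ns);
        return 0;
}
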
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
raw_spinlock_t lock;
@@ -424,7 +439,7 @@ struct cfs_bandwidth {
int nr_burst;
u64 throttled_time;
u64 burst_time;
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
};
/* Task group related information */
@@ -442,15 +457,13 @@ struct task_group {
/* runqueue "owned" by this group on each CPU */
struct cfs_rq **cfs_rq;
unsigned long shares;
-#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
* it in its own cache-line separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
@@ -531,7 +544,7 @@ extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void free_fair_sched_group(struct task_group *tg) { }
static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
@@ -539,7 +552,7 @@ static inline int alloc_fair_sched_group(struct task_group *tg, struct task_grou
}
static inline void online_fair_sched_group(struct task_group *tg) { }
static inline void unregister_fair_sched_group(struct task_group *tg) { }
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
@@ -573,25 +586,20 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
-#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
-#else /* !CONFIG_SMP */
-static inline void set_task_rq_fair(struct sched_entity *se,
- struct cfs_rq *prev, struct cfs_rq *next) { }
-#endif /* CONFIG_SMP */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
-#else /* CONFIG_CGROUP_SCHED */
+#else /* !CONFIG_CGROUP_SCHED: */
struct cfs_bandwidth { };
static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* !CONFIG_CGROUP_SCHED */
extern void unregister_rt_sched_group(struct task_group *tg);
extern void free_rt_sched_group(struct task_group *tg);
@@ -667,7 +675,6 @@ struct cfs_rq {
struct sched_entity *curr;
struct sched_entity *next;
-#ifdef CONFIG_SMP
/*
* CFS load tracking
*/
@@ -699,7 +706,6 @@ struct cfs_rq {
u64 last_h_load_update;
struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
-#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
@@ -796,19 +802,13 @@ struct rt_rq {
struct rt_prio_array active;
unsigned int rt_nr_running;
unsigned int rr_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
-#ifdef CONFIG_SMP
int next; /* next highest */
-#endif
} highest_prio;
-#endif
-#ifdef CONFIG_SMP
bool overloaded;
struct plist_head pushable_tasks;
-#endif /* CONFIG_SMP */
int rt_queued;
#ifdef CONFIG_RT_GROUP_SCHED
@@ -839,7 +839,6 @@ struct dl_rq {
unsigned int dl_nr_running;
-#ifdef CONFIG_SMP
/*
* Deadline values of the currently executing and the
* earliest ready task on this rq. Caching these facilitates
@@ -859,9 +858,7 @@ struct dl_rq {
* of the leftmost (earliest deadline) element.
*/
struct rb_root_cached pushable_dl_tasks_root;
-#else
- struct dl_bw dl_bw;
-#endif
+
/*
* "Active utilization" for this runqueue: increased when a
* task wakes up (becomes TASK_RUNNING) and decreased when a
@@ -932,7 +929,6 @@ static inline long se_runnable(struct sched_entity *se)
#endif /* !CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_SMP
/*
* XXX we want to get rid of these helpers and use the full load resolution.
*/
@@ -1008,7 +1004,7 @@ struct root_domain {
/* These atomics are updated outside of a lock */
atomic_t rto_loop_next;
atomic_t rto_loop_start;
-#endif
+#endif /* HAVE_RT_PUSH_IPI */
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@@ -1043,7 +1039,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status)
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
-#endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -1107,18 +1102,14 @@ struct rq {
unsigned int numa_migrate_on;
#endif
#ifdef CONFIG_NO_HZ_COMMON
-#ifdef CONFIG_SMP
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
call_single_data_t nohz_csd;
-#endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
-#ifdef CONFIG_SMP
unsigned int ttwu_pending;
-#endif
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
@@ -1149,12 +1140,17 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned int nr_uninterruptible;
+ unsigned long nr_uninterruptible;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+ struct task_struct __rcu *donor; /* Scheduling context */
+ struct task_struct __rcu *curr; /* Execution context */
+#else
union {
struct task_struct __rcu *donor; /* Scheduler context */
struct task_struct __rcu *curr; /* Execution context */
};
+#endif
struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
@@ -1183,7 +1179,6 @@ struct rq {
int membarrier_state;
#endif
-#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain __rcu *sd;
@@ -1224,7 +1219,6 @@ struct rq {
#ifdef CONFIG_HOTPLUG_CPU
struct rcuwait hotplug_wait;
#endif
-#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
@@ -1242,9 +1236,7 @@ struct rq {
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
-#ifdef CONFIG_SMP
call_single_data_t hrtick_csd;
-#endif
struct hrtimer hrtick_timer;
ktime_t hrtick_time;
#endif
@@ -1271,9 +1263,7 @@ struct rq {
struct cpuidle_state *idle_state;
#endif
-#ifdef CONFIG_SMP
unsigned int nr_pinned;
-#endif
unsigned int push_busy;
struct cpu_stop_work push_work;
@@ -1294,12 +1284,12 @@ struct rq {
unsigned int core_forceidle_seq;
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
-#endif
+#endif /* CONFIG_SCHED_CORE */
/* Scratch cpumask to be temporarily used under rq_lock */
cpumask_var_t scratch_mask;
-#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+#ifdef CONFIG_CFS_BANDWIDTH
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
#endif
@@ -1313,32 +1303,24 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
return cfs_rq->rq;
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
return container_of(cfs_rq, struct rq, cfs);
}
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static inline int cpu_of(struct rq *rq)
{
-#ifdef CONFIG_SMP
return rq->cpu;
-#else
- return 0;
-#endif
}
#define MDF_PUSH 0x01
static inline bool is_migration_disabled(struct task_struct *p)
{
-#ifdef CONFIG_SMP
return p->migration_disabled;
-#else
- return false;
-#endif
}
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -1349,10 +1331,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
+{
+ rcu_assign_pointer(rq->donor, t);
+}
+#else
static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
{
/* Do nothing */
}
+#endif
#ifdef CONFIG_SCHED_CORE
static inline struct cpumask *sched_group_span(struct sched_group *sg);
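
The layout change above is the core of the proxy-execution split: without CONFIG_SCHED_PROXY_EXEC the scheduling context (donor) and execution context (curr) are always the same task and can alias, while with it they may name different tasks and rq_set_donor() performs a real assignment. A simplified picture of the two layouts (RCU annotations dropped, task names invented for illustration):

#include <stdio.h>

struct task {
        const char *comm;
};

/* CONFIG_SCHED_PROXY_EXEC: scheduling and execution context are distinct. */
struct rq_proxy {
        struct task *donor;     /* whose time slice / priority is charged */
        struct task *curr;      /* who actually runs, e.g. the lock owner */
};

/* !CONFIG_SCHED_PROXY_EXEC: one task, two names. */
struct rq_plain {
        union {
                struct task *donor;
                struct task *curr;
        };
};

int main(void)
{
        struct task owner  = { "lock-owner" };
        struct task waiter = { "blocked-waiter" };

        struct rq_proxy rqp = { .donor = &waiter, .curr = &owner };
        printf("proxy: donor=%s curr=%s\n", rqp.donor->comm, rqp.curr->comm);

        struct rq_plain rq;
        rq.curr = &owner;
        printf("plain: donor=%s curr=%s\n", rq.donor->comm, rq.curr->comm);
        return 0;
}
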
@@ -1500,6 +1489,7 @@ static inline bool sched_group_cookie_match(struct rq *rq,
}
#endif /* !CONFIG_SCHED_CORE */
+
#ifdef CONFIG_RT_GROUP_SCHED
# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
DECLARE_STATIC_KEY_FALSE(rt_group_sched);
@@ -1507,16 +1497,16 @@ static inline bool rt_group_sched_enabled(void)
{
return static_branch_unlikely(&rt_group_sched);
}
-# else
+# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */
DECLARE_STATIC_KEY_TRUE(rt_group_sched);
static inline bool rt_group_sched_enabled(void)
{
return static_branch_likely(&rt_group_sched);
}
-# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */
-#else
+# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
# define rt_group_sched_enabled() false
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline void lockdep_assert_rq_held(struct rq *rq)
{
@@ -1574,9 +1564,9 @@ static inline void update_idle_core(struct rq *rq)
__update_idle_core(rq);
}
-#else
+#else /* !CONFIG_SCHED_SMT: */
static inline void update_idle_core(struct rq *rq) { }
-#endif
+#endif /* !CONFIG_SCHED_SMT */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1736,10 +1726,10 @@ extern struct balance_callback balance_push_callback;
#ifdef CONFIG_SCHED_CLASS_EXT
extern const struct sched_class ext_sched_class;
-DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
+DECLARE_STATIC_KEY_FALSE(__scx_enabled); /* SCX BPF scheduler loaded */
DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
-#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
+#define scx_enabled() static_branch_unlikely(&__scx_enabled)
#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
@@ -1757,7 +1747,7 @@ static inline void scx_rq_clock_invalidate(struct rq *rq)
WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID);
}
-#else /* !CONFIG_SCHED_CLASS_EXT */
+#else /* !CONFIG_SCHED_CLASS_EXT: */
#define scx_enabled() false
#define scx_switched_all() false
@@ -1781,9 +1771,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-#ifdef CONFIG_SMP
WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
-#endif
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
@@ -1961,8 +1949,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
#endif /* !CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_SMP
-
static inline void
queue_balance_callback(struct rq *rq,
struct balance_callback *head,
@@ -2128,8 +2114,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
return p->user_cpus_ptr;
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -2174,7 +2158,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
tg = &root_task_group;
p->rt.rt_rq = tg->rt_rq[cpu];
p->rt.parent = tg->rt_se[cpu];
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
#else /* !CONFIG_CGROUP_SCHED: */
@@ -2200,7 +2184,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
smp_wmb();
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
-#endif
+#endif /* CONFIG_SMP */
}
/*
@@ -2278,13 +2262,17 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p)
return rq->donor == p;
}
+static inline bool task_is_blocked(struct task_struct *p)
+{
+ if (!sched_proxy_exec())
+ return false;
+
+ return !!p->blocked_on;
+}
+
static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
{
-#ifdef CONFIG_SMP
return p->on_cpu;
-#else
- return task_current(rq, p);
-#endif
}
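
task_is_blocked() is the predicate the earlier fair and RT hunks key off: with proxy execution enabled, a non-NULL blocked_on marks a task that remains queued while waiting on a mutex, and such tasks are kept off the RT pushable list and skipped by the balancer. A trivial model of the helper plus the enqueue_task_rt() gate; sched_proxy_exec() is hard-wired on and the task_current() condition is omitted, purely for illustration.

#include <stdbool.h>
#include <stdio.h>

struct mutex;   /* opaque, only used as a pointer target */

struct task {
        struct mutex *blocked_on;
        int nr_cpus_allowed;
};

static bool sched_proxy_exec(void) { return true; }     /* assumed enabled */

static bool task_is_blocked(const struct task *p)
{
        if (!sched_proxy_exec())
                return false;
        return p->blocked_on != NULL;
}

/* Mirrors the new enqueue_task_rt() gate: blocked waiters are not pushable. */
static bool should_mark_pushable(const struct task *p)
{
        return !task_is_blocked(p) && p->nr_cpus_allowed > 1;
}

int main(void)
{
        struct task runnable = { .blocked_on = NULL, .nr_cpus_allowed = 4 };
        struct task waiter   = { .blocked_on = (struct mutex *)0x1,
                                 .nr_cpus_allowed = 4 };

        printf("runnable pushable: %d\n", should_mark_pushable(&runnable));
        printf("waiter pushable:   %d\n", should_mark_pushable(&waiter));
        return 0;
}
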
static inline int task_on_rq_queued(struct task_struct *p)
@@ -2307,11 +2295,9 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
-#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
static_assert(WF_FORK == SD_BALANCE_FORK);
static_assert(WF_TTWU == SD_BALANCE_WAKE);
-#endif
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -2367,11 +2353,7 @@ extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_HEAD 0x10
#define ENQUEUE_REPLENISH 0x20
-#ifdef CONFIG_SMP
#define ENQUEUE_MIGRATED 0x40
-#else
-#define ENQUEUE_MIGRATED 0x00
-#endif
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
#define ENQUEUE_DELAYED 0x200
@@ -2416,7 +2398,6 @@ struct sched_class {
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
-#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
@@ -2429,7 +2410,6 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
-#endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
@@ -2487,7 +2467,7 @@ static inline void put_prev_set_next_task(struct rq *rq,
struct task_struct *prev,
struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != prev);
+ WARN_ON_ONCE(rq->donor != prev);
__put_prev_set_next_dl_server(rq, prev, next);
@@ -2581,8 +2561,6 @@ extern struct task_struct *pick_task_idle(struct rq *rq);
#define SCA_MIGRATE_ENABLE 0x04
#define SCA_USER 0x08
-#ifdef CONFIG_SMP
-
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void sched_balance_trigger(struct rq *rq);
@@ -2634,26 +2612,6 @@ static inline struct task_struct *get_push_task(struct rq *rq)
extern int push_cpu_stop(void *arg);
-#else /* !CONFIG_SMP: */
-
-static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
-{
- return true;
-}
-
-static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- struct affinity_context *ctx)
-{
- return set_cpus_allowed_ptr(p, ctx->new_mask);
-}
-
-static inline cpumask_t *alloc_user_cpus_ptr(int node)
-{
- return NULL;
-}
-
-#endif /* !CONFIG_SMP */
-
#ifdef CONFIG_CPU_IDLE
static inline void idle_set_state(struct rq *rq,
@@ -2749,10 +2707,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
call_trace_sched_update_nr_running(rq, count);
}
-#ifdef CONFIG_SMP
if (prev_nr < 2 && rq->nr_running >= 2)
set_rd_overloaded(rq->rd, 1);
-#endif
sched_update_tick_dependency(rq);
}
@@ -2918,10 +2874,7 @@ unsigned long arch_scale_freq_capacity(int cpu)
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
{
rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
- /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
-#ifdef CONFIG_SMP
rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
-#endif
}
#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
@@ -2930,8 +2883,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock
{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
_lock; return _t; }
-#ifdef CONFIG_SMP
-
static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
{
#ifdef CONFIG_SCHED_CORE
@@ -2954,7 +2905,7 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
/*
* __sched_core_flip() relies on SMT having cpu-id lock order.
*/
-#endif
+#endif /* CONFIG_SCHED_CORE */
return rq1->cpu < rq2->cpu;
}
@@ -3091,42 +3042,6 @@ extern void set_rq_offline(struct rq *rq);
extern bool sched_smp_initialized;
-#else /* !CONFIG_SMP: */
-
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- WARN_ON_ONCE(!irqs_disabled());
- WARN_ON_ONCE(rq1 != rq2);
- raw_spin_rq_lock(rq1);
- __acquire(rq2->lock); /* Fake it out ;) */
- double_rq_clock_clear_update(rq1, rq2);
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- __releases(rq1->lock)
- __releases(rq2->lock)
-{
- WARN_ON_ONCE(rq1 != rq2);
- raw_spin_rq_unlock(rq1);
- __release(rq2->lock);
-}
-
-#endif /* !CONFIG_SMP */
-
DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
double_rq_lock(_T->lock, _T->lock2),
double_rq_unlock(_T->lock, _T->lock2))
@@ -3145,6 +3060,7 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
+
#ifdef CONFIG_NUMA_BALANCING
extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
@@ -3184,7 +3100,7 @@ extern void nohz_balance_exit_idle(struct rq *rq);
static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif /* !CONFIG_NO_HZ_COMMON */
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
extern void nohz_run_idle_balance(int cpu);
#else
static inline void nohz_run_idle_balance(int cpu) { }
@@ -3254,14 +3170,14 @@ static inline u64 irq_time_read(int cpu)
return total;
}
-#else
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline int irqtime_enabled(void)
{
return 0;
}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
@@ -3310,8 +3226,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
# define arch_scale_freq_invariant() false
#endif
-#ifdef CONFIG_SMP
-
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long *min,
unsigned long *max);
@@ -3355,10 +3269,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
-#else /* !CONFIG_SMP */
-static inline bool update_other_load_avgs(struct rq *rq) { return false; }
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_UCLAMP_TASK
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
@@ -3535,15 +3445,13 @@ static inline bool sched_energy_enabled(void)
return static_branch_unlikely(&sched_energy_present);
}
-extern struct cpufreq_governor schedutil_gov;
-
-#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
#define perf_domain_span(pd) NULL
static inline bool sched_energy_enabled(void) { return false; }
-#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
#ifdef CONFIG_MEMBARRIER
@@ -3569,7 +3477,7 @@ static inline void membarrier_switch_mm(struct rq *rq,
WRITE_ONCE(rq->membarrier_state, membarrier_state);
}
-#else /* !CONFIG_MEMBARRIER :*/
+#else /* !CONFIG_MEMBARRIER: */
static inline void membarrier_switch_mm(struct rq *rq,
struct mm_struct *prev_mm,
@@ -3579,7 +3487,6 @@ static inline void membarrier_switch_mm(struct rq *rq,
#endif /* !CONFIG_MEMBARRIER */
-#ifdef CONFIG_SMP
static inline bool is_per_cpu_kthread(struct task_struct *p)
{
if (!(p->flags & PF_KTHREAD))
@@ -3590,7 +3497,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
return true;
}
-#endif
extern void swake_up_all_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
@@ -3889,7 +3795,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
-#ifdef CONFIG_SMP
static inline
void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
{
@@ -3910,7 +3815,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu)
return false;
}
-#endif
#ifdef CONFIG_RT_MUTEXES
@@ -3951,21 +3855,8 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio);
-#ifdef CONFIG_SMP
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
-#else
-
-static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
-{
- return NULL;
-}
-
-static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
-{
-}
-
-#endif
#ifdef CONFIG_SCHED_CLASS_EXT
/*
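The DEFINE_LOCK_GUARD_2() wrapper above lets callers take and release both runqueue locks by scope instead of pairing double_rq_lock()/double_rq_unlock() by hand. A minimal usage sketch, assuming the <linux/cleanup.h> guard() helper; the caller below is hypothetical and not part of the patch:

static void lock_both_rqs_and_inspect(struct rq *src_rq, struct rq *dst_rq)
{
	/*
	 * Takes both locks in the order enforced by double_rq_lock() and
	 * releases them automatically when this scope ends.
	 */
	guard(double_rq_lock)(src_rq, dst_rq);

	/* Both runqueues can be inspected or modified safely here. */
}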
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 21ac44428bb0..7f151d96dba9 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -1,8 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_SCHED_SMP_H
+#define _KERNEL_SCHED_SMP_H
+
/*
* Scheduler internal SMP callback types and methods between the scheduler
* and other internal parts of the core kernel:
*/
+#include <linux/types.h>
extern void sched_ttwu_pending(void *arg);
@@ -13,3 +18,5 @@ extern void flush_smp_call_function_queue(void);
#else
static inline void flush_smp_call_function_queue(void) { }
#endif
+
+#endif /* _KERNEL_SCHED_SMP_H */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 4346fd81c31f..d1c9429a4ac5 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -2,6 +2,7 @@
/*
* /proc/schedstat implementation
*/
+#include "sched.h"
void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
@@ -114,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "timestamp %lu\n", jiffies);
} else {
struct rq *rq;
-#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcount = 0;
-#endif
cpu = (unsigned long)(v - 2);
rq = cpu_rq(cpu);
@@ -132,7 +131,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
-#ifdef CONFIG_SMP
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -163,7 +161,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->ttwu_move_balance);
}
rcu_read_unlock();
-#endif
}
return 0;
}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 452826df6ae1..26f3fd4d34ce 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -112,10 +112,10 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
-#else
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
struct task_struct *prev) {}
-#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -220,7 +220,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
psi_task_switch(prev, next, sleep);
}
-#else /* CONFIG_PSI */
+#else /* !CONFIG_PSI: */
static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
@@ -229,7 +229,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
bool sleep) {}
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
struct task_struct *prev) {}
-#endif /* CONFIG_PSI */
+#endif /* !CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
/*
@@ -334,6 +334,6 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n
# define sched_info_enqueue(rq, t) do { } while (0)
# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
-#endif /* CONFIG_SCHED_INFO */
+#endif /* !CONFIG_SCHED_INFO */
#endif /* _KERNEL_STATS_H */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 058dd42e3d9b..2d4e279f05ee 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -7,8 +7,8 @@
*
* See kernel/stop_machine.c
*/
+#include "sched.h"
-#ifdef CONFIG_SMP
static int
select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{
@@ -20,7 +20,6 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
return sched_stop_runnable(rq);
}
-#endif /* CONFIG_SMP */
static void
wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
@@ -106,11 +105,9 @@ DEFINE_SCHED_CLASS(stop) = {
.put_prev_task = put_prev_task_stop,
.set_next_task = set_next_task_stop,
-#ifdef CONFIG_SMP
.balance = balance_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
-#endif
.task_tick = task_tick_stop,
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 72505cd3b60a..0fef6496c4c8 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -2,6 +2,7 @@
/*
* <linux/swait.h> (simple wait queues ) implementation:
*/
+#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 547c1f05b667..77ae87f36e84 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -174,7 +174,7 @@ SYSCALL_DEFINE1(nice, int, increment)
return 0;
}
-#endif
+#endif /* __ARCH_WANT_SYS_NICE */
/**
* task_prio - return the priority value of a given task.
@@ -209,10 +209,8 @@ int idle_cpu(int cpu)
if (rq->nr_running)
return 0;
-#ifdef CONFIG_SMP
if (rq->ttwu_pending)
return 0;
-#endif
return 1;
}
@@ -255,8 +253,7 @@ int sched_core_idle_cpu(int cpu)
return idle_cpu(cpu);
}
-
-#endif
+#endif /* CONFIG_SCHED_CORE */
/**
* find_process_by_pid - find a process with a matching PID value.
@@ -448,7 +445,7 @@ static inline int uclamp_validate(struct task_struct *p,
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
-#endif
+#endif /* !CONFIG_UCLAMP_TASK */
/*
* Allow unprivileged RT tasks to decrease priority.
@@ -642,7 +639,6 @@ change:
goto unlock;
}
#endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_SMP
if (dl_bandwidth_enabled() && dl_policy(policy) &&
!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
@@ -658,7 +654,6 @@ change:
goto unlock;
}
}
-#endif
}
/* Re-check policy now with rq lock held: */
@@ -1120,7 +1115,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
}
-#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
/*
@@ -1149,7 +1143,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
return 0;
}
-#endif /* CONFIG_SMP */
int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
@@ -1242,7 +1235,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
if (user_mask) {
cpumask_copy(user_mask, in_mask);
- } else if (IS_ENABLED(CONFIG_SMP)) {
+ } else {
return -ENOMEM;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a2a38e1b6f18..977e133bb8a4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,7 +3,9 @@
* Scheduler topology setup/handling methods
*/
+#include <linux/sched/isolation.h>
#include <linux/bsearch.h>
+#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
void sched_domains_mutex_lock(void)
@@ -87,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!(sd->flags & SD_OVERLAP) &&
+ if (!(sd->flags & SD_NUMA) &&
cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
@@ -100,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
group->sgc->id,
cpumask_pr_args(sched_group_span(group)));
- if ((sd->flags & SD_OVERLAP) &&
+ if ((sd->flags & SD_NUMA) &&
!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
printk(KERN_CONT " mask=%*pbl",
cpumask_pr_args(group_balance_mask(group)));
@@ -212,8 +214,6 @@ static bool sched_energy_update;
static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
{
bool any_asym_capacity = false;
- struct cpufreq_policy *policy;
- struct cpufreq_governor *gov;
int i;
/* EAS is enabled for asymmetric CPU capacity topologies. */
@@ -248,25 +248,12 @@ static bool sched_is_eas_possible(const struct cpumask *cpu_mask)
return false;
}
- /* Do not attempt EAS if schedutil is not being used. */
- for_each_cpu(i, cpu_mask) {
- policy = cpufreq_cpu_get(i);
- if (!policy) {
- if (sched_debug()) {
- pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
- cpumask_pr_args(cpu_mask), i);
- }
- return false;
- }
- gov = policy->governor;
- cpufreq_cpu_put(policy);
- if (gov != &schedutil_gov) {
- if (sched_debug()) {
- pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n",
- cpumask_pr_args(cpu_mask));
- }
- return false;
+ if (!cpufreq_ready_for_eas(cpu_mask)) {
+ if (sched_debug()) {
+ pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n",
+ cpumask_pr_args(cpu_mask));
}
+ return false;
}
return true;
@@ -328,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void)
}
late_initcall(sched_energy_aware_sysctl_init);
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
static void free_pd(struct perf_domain *pd)
{
@@ -464,9 +451,9 @@ free:
return false;
}
-#else
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
static void free_rootdomain(struct rcu_head *rcu)
{
@@ -1333,8 +1320,6 @@ next:
update_group_capacity(sd, cpu);
}
-#ifdef CONFIG_SMP
-
/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */
void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
{
@@ -1359,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
* "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
* which is shared by all the overlapping groups.
*/
- WARN_ON_ONCE(sd->flags & SD_OVERLAP);
+ WARN_ON_ONCE(sd->flags & SD_NUMA);
sg = sd->groups;
if (cpu != sg->asym_prefer_cpu) {
@@ -1389,8 +1374,6 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
}
}
-#endif /* CONFIG_SMP */
-
/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
@@ -1613,7 +1596,7 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-#endif
+#endif /* CONFIG_NUMA */
/*
* SD_flags allowed in topology descriptions.
@@ -1729,7 +1712,7 @@ sd_init(struct sched_domain_topology_level *tl,
SD_WAKE_AFFINE);
}
-#endif
+#endif /* CONFIG_NUMA */
} else {
sd->cache_nice_tries = 1;
}
@@ -1754,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl,
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+ SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
#endif
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(cpu_cpu_mask, NULL, PKG),
{ NULL, },
};
@@ -2025,23 +2008,14 @@ void sched_init_numa(int offline_node)
/*
* Add the NUMA identity distance, aka single NODE.
*/
- tl[i++] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .numa_level = 0,
- SD_INIT_NAME(NODE)
- };
+ tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 1; j < nr_levels; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
+ tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+ tl[i].numa_level = j;
}
sched_domain_topology_saved = sched_domain_topology;
@@ -2352,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sd) {
sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
+ if (sd && (sd->flags & SD_NUMA))
free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sd, j));
}
@@ -2418,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
id_seen = sched_domains_tmpmask2;
for_each_sd_topology(tl) {
+ int tl_common_flags = 0;
+
+ if (tl->sd_flags)
+ tl_common_flags = (*tl->sd_flags)();
/* NUMA levels are allowed to overlap */
- if (tl->flags & SDTL_OVERLAP)
+ if (tl_common_flags & SD_NUMA)
continue;
cpumask_clear(covered);
@@ -2491,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP)
- sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
@@ -2505,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
+ if (sd->flags & SD_NUMA) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 51e38f5f4701..20f27e2cf7ae 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -4,6 +4,7 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
+#include "sched.h"
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
@@ -40,13 +41,31 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
{
unsigned long flags;
- wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ wq_entry->flags |= WQ_FLAG_PRIORITY;
spin_lock_irqsave(&wq_head->lock, flags);
__add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+ struct wait_queue_entry *wq_entry)
+{
+ struct list_head *head = &wq_head->head;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+
+ guard(spinlock_irqsave)(&wq_head->lock);
+
+ if (!list_empty(head) &&
+ (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY))
+ return -EBUSY;
+
+ list_add(&wq_entry->entry, head);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -64,7 +83,7 @@ EXPORT_SYMBOL(remove_wait_queue);
* the non-exclusive tasks. Normally, exclusive tasks will be at the end of
* the list and any non-exclusive tasks will be woken first. A priority task
* may be at the head of the list, and can consume the event without any other
- * tasks being woken.
+ * tasks being woken if it's also an exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
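add_wait_queue_priority_exclusive() above refuses to queue a second priority waiter and returns -EBUSY instead. A hedged caller sketch; the helper name and wake callback are hypothetical, only init_waitqueue_func_entry() and the new API are taken as given:

static int attach_priority_waiter(struct wait_queue_head *wqh,
				  struct wait_queue_entry *wait,
				  wait_queue_func_t wake_fn)
{
	/* Run wake_fn instead of the default wake function on wakeup. */
	init_waitqueue_func_entry(wait, wake_fn);

	/*
	 * Fails with -EBUSY if another priority waiter already occupies the
	 * head of the list, so the caller can report the conflict.
	 */
	return add_wait_queue_priority_exclusive(wqh, wait);
}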
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index b410b61cec95..1088d3b7012c 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,5 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched/debug.h>
+#include "sched.h"
+
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
diff --git a/kernel/signal.c b/kernel/signal.c
index f8859faa26c5..e2c928de7d2c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3016,7 +3016,7 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(&ksig->info);
+ vfs_coredump(&ksig->info);
}
/*
@@ -4981,9 +4981,20 @@ static const struct ctl_table signal_debug_table[] = {
#endif
};
+static const struct ctl_table signal_table[] = {
+ {
+ .procname = "print-fatal-signals",
+ .data = &print_fatal_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
static int __init init_signal_sysctls(void)
{
register_sysctl_init("debug", signal_debug_table);
+ register_sysctl_init("kernel", signal_table);
return 0;
}
early_initcall(init_signal_sysctls);
diff --git a/kernel/smp.c b/kernel/smp.c
index 974f3a3962e8..4649fa4872ff 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -86,13 +86,15 @@ int smpcfd_dead_cpu(unsigned int cpu)
int smpcfd_dying_cpu(unsigned int cpu)
{
/*
- * The IPIs for the smp-call-function callbacks queued by other
- * CPUs might arrive late, either due to hardware latencies or
- * because this CPU disabled interrupts (inside stop-machine)
- * before the IPIs were sent. So flush out any pending callbacks
- * explicitly (without waiting for the IPIs to arrive), to
- * ensure that the outgoing CPU doesn't go offline with work
- * still pending.
+ * The IPIs for the smp-call-function callbacks queued by other CPUs
+ * might arrive late, either due to hardware latencies or because this
+ * CPU disabled interrupts (inside stop-machine) before the IPIs were
+ * sent. So flush out any pending callbacks explicitly (without waiting
+ * for the IPIs to arrive), to ensure that the outgoing CPU doesn't go
+ * offline with work still pending.
+ *
+ * This runs with interrupts disabled inside the stopper task invoked by
+ * stop_machine(), ensuring mutually exclusive CPU offlining and IPI flush.
*/
__flush_smp_call_function_queue(false);
irq_work_run();
@@ -418,6 +420,10 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
*/
static int generic_exec_single(int cpu, call_single_data_t *csd)
{
+ /*
+ * Preemption already disabled here so stopper cannot run on this CPU,
+ * ensuring mutually exclusive CPU offlining and last IPI flush.
+ */
if (cpu == smp_processor_id()) {
smp_call_func_t func = csd->func;
void *info = csd->info;
@@ -638,8 +644,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int err;
/*
- * prevent preemption and reschedule on another processor,
- * as well as CPU removal
+ * Prevent preemption and reschedule on another CPU, as well as CPU
+ * removal. This prevents stopper from running on this CPU, thus
+ * providing mutual exclusion of the below cpu_online() check and
+ * IPI sending, ensuring IPIs are not missed by a CPU going offline.
*/
this_cpu = get_cpu();
@@ -741,32 +749,19 @@ EXPORT_SYMBOL_GPL(smp_call_function_single_async);
*
* Selection preference:
* 1) current cpu if in @mask
- * 2) any cpu of current node if in @mask
- * 3) any other online cpu in @mask
+ * 2) nearest cpu in @mask, based on NUMA topology
*/
int smp_call_function_any(const struct cpumask *mask,
smp_call_func_t func, void *info, int wait)
{
unsigned int cpu;
- const struct cpumask *nodemask;
int ret;
/* Try for same CPU (cheapest) */
cpu = get_cpu();
- if (cpumask_test_cpu(cpu, mask))
- goto call;
-
- /* Try for same node. */
- nodemask = cpumask_of_node(cpu_to_node(cpu));
- for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
- cpu = cpumask_next_and(cpu, nodemask, mask)) {
- if (cpu_online(cpu))
- goto call;
- }
+ if (!cpumask_test_cpu(cpu, mask))
+ cpu = sched_numa_find_nth_cpu(mask, 0, cpu_to_node(cpu));
- /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
- cpu = cpumask_any_and(mask, cpu_online_mask);
-call:
ret = smp_call_function_single(cpu, func, info, wait);
put_cpu();
return ret;
@@ -792,7 +787,6 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
bool wait = scf_flags & SCF_WAIT;
int nr_cpus = 0;
bool run_remote = false;
- bool run_local = false;
lockdep_assert_preemption_disabled();
@@ -814,19 +808,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
*/
WARN_ON_ONCE(!in_task());
- /* Check if we need local execution. */
- if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) &&
- (!cond_func || cond_func(this_cpu, info)))
- run_local = true;
-
/* Check if we need remote execution, i.e., any CPU excluding this one. */
- cpu = cpumask_first_and(mask, cpu_online_mask);
- if (cpu == this_cpu)
- cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
- if (cpu < nr_cpu_ids)
- run_remote = true;
-
- if (run_remote) {
+ if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) {
cfd = this_cpu_ptr(&cfd_data);
cpumask_and(cfd->cpumask, mask, cpu_online_mask);
__cpumask_clear_cpu(this_cpu, cfd->cpumask);
@@ -840,6 +823,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
continue;
}
+ /* Work is enqueued on a remote CPU. */
+ run_remote = true;
+
csd_lock(csd);
if (wait)
csd->node.u_flags |= CSD_TYPE_SYNC;
@@ -851,6 +837,10 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
#endif
trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
+ /*
+ * Kick the remote CPU if this is the first work
+ * item enqueued.
+ */
if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
nr_cpus++;
@@ -869,7 +859,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
send_call_function_ipi_mask(cfd->cpumask_ipi);
}
- if (run_local) {
+ /* Check if we need local execution. */
+ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) &&
+ (!cond_func || cond_func(this_cpu, info))) {
unsigned long flags;
local_irq_save(flags);
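The rework above keys off cpumask_any_and_but(), which returns a CPU present in both masks while skipping one specified CPU, or a value >= nr_cpu_ids when no such CPU exists. A small semantics sketch with a hypothetical helper name, assuming the <linux/cpumask.h> API used in the hunk:

static bool needs_remote_work(const struct cpumask *mask, unsigned int this_cpu)
{
	/*
	 * True if (mask & cpu_online_mask) contains at least one CPU other
	 * than this_cpu, i.e. at least one remote CPU must be IPI'd.
	 */
	return cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids;
}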
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 1992b62e980b..4503b60ce9bd 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -18,8 +18,6 @@
#include "smpboot.h"
-#ifdef CONFIG_SMP
-
#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
/*
* For the hotplug case we keep the task structs around and reuse
@@ -76,8 +74,6 @@ void __init idle_threads_init(void)
}
#endif
-#endif /* #ifdef CONFIG_SMP */
-
static LIST_HEAD(hotplug_threads);
static DEFINE_MUTEX(smpboot_threads_lock);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5d2d0562115b..3fe6b0c99f3d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work,
- struct wake_q_head *wakeq)
+ struct cpu_stop_work *work)
{
list_add_tail(&work->list, &stopper->works);
- wake_q_add(wakeq, stopper->thread);
}
/* queue @work to @stopper. if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- DEFINE_WAKE_Q(wakeq);
unsigned long flags;
bool enabled;
@@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
raw_spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
- __cpu_stop_queue_work(stopper, work, &wakeq);
+ __cpu_stop_queue_work(stopper, work);
else if (work->done)
cpu_stop_signal_done(work->done);
raw_spin_unlock_irqrestore(&stopper->lock, flags);
- wake_up_q(&wakeq);
+ if (enabled)
+ wake_up_process(stopper->thread);
preempt_enable();
return enabled;
@@ -264,7 +262,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
{
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
- DEFINE_WAKE_Q(wakeq);
int err;
retry:
@@ -300,8 +297,8 @@ retry:
}
err = 0;
- __cpu_stop_queue_work(stopper1, work1, &wakeq);
- __cpu_stop_queue_work(stopper2, work2, &wakeq);
+ __cpu_stop_queue_work(stopper1, work1);
+ __cpu_stop_queue_work(stopper2, work2);
unlock:
raw_spin_unlock(&stopper2->lock);
@@ -316,7 +313,10 @@ unlock:
goto retry;
}
- wake_up_q(&wakeq);
+ if (!err) {
+ wake_up_process(stopper1->thread);
+ wake_up_process(stopper2->thread);
+ }
preempt_enable();
return err;
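Both stop_machine hunks above reduce the wakeup path to the same shape: queue the work while holding the stopper lock, then wake the stopper thread directly once the lock has been dropped, which makes the wake_q indirection unnecessary. A distilled sketch of that pattern with a hypothetical helper name:

static bool queue_then_wake(struct cpu_stopper *stopper, struct cpu_stop_work *work)
{
	unsigned long flags;
	bool enabled;

	raw_spin_lock_irqsave(&stopper->lock, flags);
	enabled = stopper->enabled;
	if (enabled)
		list_add_tail(&work->list, &stopper->works);
	raw_spin_unlock_irqrestore(&stopper->lock, flags);

	/* Wake outside the lock, mirroring cpu_stop_queue_work() above. */
	if (enabled)
		wake_up_process(stopper->thread);

	return enabled;
}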
diff --git a/kernel/sys.c b/kernel/sys.c
index adc0de0aa364..18a037cc6f61 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -181,6 +181,35 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);
+static const struct ctl_table overflow_sysctl_table[] = {
+ {
+ .procname = "overflowuid",
+ .data = &overflowuid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
+ },
+ {
+ .procname = "overflowgid",
+ .data = &overflowgid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
+ },
+};
+
+static int __init init_overflow_sysctl(void)
+{
+ register_sysctl_init("kernel", overflow_sysctl_table);
+ return 0;
+}
+
+postcore_initcall(init_overflow_sysctl);
+
/*
* Returns true if current's euid is same as p's uid or euid,
* or has CAP_SYS_NICE to p's user_ns.
diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c
index eb2842bd0557..92f94ea28957 100644
--- a/kernel/sysctl-test.c
+++ b/kernel/sysctl-test.c
@@ -367,54 +367,6 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max(
KUNIT_EXPECT_EQ(test, 0, *((int *)table.data));
}
-/*
- * Test that registering an invalid extra value is not allowed.
- */
-static void sysctl_test_register_sysctl_sz_invalid_extra_value(
- struct kunit *test)
-{
- unsigned char data = 0;
- const struct ctl_table table_foo[] = {
- {
- .procname = "foo",
- .data = &data,
- .maxlen = sizeof(u8),
- .mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
- .extra1 = SYSCTL_FOUR,
- .extra2 = SYSCTL_ONE_THOUSAND,
- },
- };
-
- const struct ctl_table table_bar[] = {
- {
- .procname = "bar",
- .data = &data,
- .maxlen = sizeof(u8),
- .mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
- .extra1 = SYSCTL_NEG_ONE,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- };
-
- const struct ctl_table table_qux[] = {
- {
- .procname = "qux",
- .data = &data,
- .maxlen = sizeof(u8),
- .mode = 0644,
- .proc_handler = proc_dou8vec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_TWO_HUNDRED,
- },
- };
-
- KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_foo));
- KUNIT_EXPECT_NULL(test, register_sysctl("foo", table_bar));
- KUNIT_EXPECT_NOT_NULL(test, register_sysctl("foo", table_qux));
-}
-
static struct kunit_case sysctl_test_cases[] = {
KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data),
KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset),
@@ -426,7 +378,6 @@ static struct kunit_case sysctl_test_cases[] = {
KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative),
KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min),
KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max),
- KUNIT_CASE(sysctl_test_register_sysctl_sz_invalid_extra_value),
{}
};
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3b7a7308e35b..cb6196e3fa99 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1,76 +1,28 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* sysctl.c: General linux system control interface
- *
- * Begun 24 March 1995, Stephen Tweedie
- * Added /proc support, Dec 1995
- * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
- * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
- * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
- * Dynamic registration fixes, Stephen Tweedie.
- * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
- * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris
- * Horn.
- * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
- * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
- * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
- * Wendling.
- * The list_for_each() macro wasn't appropriate for the sysctl loop.
- * Removed it and replaced it with older style, 03/23/00, Bill Wendling
*/
-#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/bitmap.h>
-#include <linux/signal.h>
-#include <linux/panic.h>
-#include <linux/printk.h>
#include <linux/proc_fs.h>
-#include <linux/security.h>
#include <linux/ctype.h>
-#include <linux/filter.h>
-#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
-#include <linux/net.h>
-#include <linux/sysrq.h>
#include <linux/highuid.h>
#include <linux/writeback.h>
-#include <linux/ratelimit.h>
#include <linux/initrd.h>
-#include <linux/key.h>
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/syscalls.h>
-#include <linux/nfs_fs.h>
-#include <linux/acpi.h>
-#include <linux/reboot.h>
-#include <linux/ftrace.h>
-#include <linux/kmod.h>
#include <linux/capability.h>
-#include <linux/binfmts.h>
-#include <linux/sched/sysctl.h>
-#include <linux/mount.h>
-#include <linux/pid.h>
#include "../lib/kstrtox.h"
#include <linux/uaccess.h>
#include <asm/processor.h>
-#ifdef CONFIG_X86
-#include <asm/nmi.h>
-#include <asm/stacktrace.h>
-#include <asm/io.h>
-#endif
-#ifdef CONFIG_SPARC
-#include <asm/setup.h>
-#endif
-#ifdef CONFIG_RT_MUTEXES
-#include <linux/rtmutex.h>
-#endif
-
/* shared constants to be used in various sysctls */
const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
EXPORT_SYMBOL(sysctl_vals);
@@ -743,49 +695,6 @@ int proc_douintvec(const struct ctl_table *table, int write, void *buffer,
do_proc_douintvec_conv, NULL);
}
-/*
- * Taint values can only be increased
- * This means we can safely use a temporary.
- */
-static int proc_taint(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long tmptaint = get_taint();
- int err;
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- t = *table;
- t.data = &tmptaint;
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- if (write) {
- int i;
-
- /*
- * If we are relying on panic_on_taint not producing
- * false positives due to userspace input, bail out
- * before setting the requested taint flags.
- */
- if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint))
- return -EINVAL;
-
- /*
- * Poor man's atomic or. Not worth adding a primitive
- * to everyone's atomic.h for this
- */
- for (i = 0; i < TAINT_FLAGS_COUNT; i++)
- if ((1UL << i) & tmptaint)
- add_taint(i, LOCKDEP_STILL_OK);
- }
-
- return err;
-}
-
/**
* struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
* @min: pointer to minimum allowable value
@@ -975,26 +884,6 @@ int proc_dou8vec_minmax(const struct ctl_table *table, int write,
}
EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
-#ifdef CONFIG_MAGIC_SYSRQ
-static int sysrq_sysctl_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int tmp, ret;
-
- tmp = sysrq_mask();
-
- ret = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (ret || !write)
- return ret;
-
- if (write)
- sysrq_toggle_support(tmp);
-
- return 0;
-}
-#endif
-
static int __do_proc_doulongvec_minmax(void *data,
const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos,
@@ -1299,28 +1188,6 @@ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buf
do_proc_dointvec_ms_jiffies_conv, NULL);
}
-static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
-{
- struct pid *new_pid;
- pid_t tmp;
- int r;
-
- tmp = pid_vnr(cad_pid);
-
- r = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (r || !write)
- return r;
-
- new_pid = find_get_pid(tmp);
- if (!new_pid)
- return -ESRCH;
-
- put_pid(xchg(&cad_pid, new_pid));
- return 0;
-}
-
/**
* proc_do_large_bitmap - read/write from/to a large bitmap
* @table: the sysctl table
@@ -1587,22 +1454,9 @@ int proc_do_static_key(const struct ctl_table *table, int write,
return ret;
}
-static const struct ctl_table kern_table[] = {
- {
- .procname = "panic",
- .data = &panic_timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
+static const struct ctl_table sysctl_subsys_table[] = {
#ifdef CONFIG_PROC_SYSCTL
{
- .procname = "tainted",
- .maxlen = sizeof(long),
- .mode = 0644,
- .proc_handler = proc_taint,
- },
- {
.procname = "sysctl_writes_strict",
.data = &sysctl_writes_strict,
.maxlen = sizeof(int),
@@ -1613,180 +1467,6 @@ static const struct ctl_table kern_table[] = {
},
#endif
{
- .procname = "print-fatal-signals",
- .data = &print_fatal_signals,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#ifdef CONFIG_SPARC
- {
- .procname = "reboot-cmd",
- .data = reboot_command,
- .maxlen = 256,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
- {
- .procname = "stop-a",
- .data = &stop_a_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "scons-poweroff",
- .data = &scons_pwroff,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_SPARC64
- {
- .procname = "tsb-ratio",
- .data = &sysctl_tsb_ratio,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_PARISC
- {
- .procname = "soft-power",
- .data = &pwrsw_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
- {
- .procname = "unaligned-trap",
- .data = &unaligned_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_STACK_TRACER
- {
- .procname = "stack_tracer_enabled",
- .data = &stack_tracer_enabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = stack_trace_sysctl,
- },
-#endif
-#ifdef CONFIG_TRACING
- {
- .procname = "ftrace_dump_on_oops",
- .data = &ftrace_dump_on_oops,
- .maxlen = MAX_TRACER_SIZE,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
- {
- .procname = "traceoff_on_warning",
- .data = &__disable_trace_on_warning,
- .maxlen = sizeof(__disable_trace_on_warning),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "tracepoint_printk",
- .data = &tracepoint_printk,
- .maxlen = sizeof(tracepoint_printk),
- .mode = 0644,
- .proc_handler = tracepoint_printk_sysctl,
- },
-#endif
-#ifdef CONFIG_MODULES
- {
- .procname = "modprobe",
- .data = &modprobe_path,
- .maxlen = KMOD_PATH_LEN,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
- {
- .procname = "modules_disabled",
- .data = &modules_disabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- /* only handle a transition from default "0" to "1" */
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_ONE,
- },
-#endif
-#ifdef CONFIG_UEVENT_HELPER
- {
- .procname = "hotplug",
- .data = &uevent_helper,
- .maxlen = UEVENT_HELPER_PATH_LEN,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
-#endif
-#ifdef CONFIG_MAGIC_SYSRQ
- {
- .procname = "sysrq",
- .data = NULL,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = sysrq_sysctl_handler,
- },
-#endif
-#ifdef CONFIG_PROC_SYSCTL
- {
- .procname = "cad_pid",
- .data = NULL,
- .maxlen = sizeof (int),
- .mode = 0600,
- .proc_handler = proc_do_cad_pid,
- },
-#endif
- {
- .procname = "threads-max",
- .data = NULL,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sysctl_max_threads,
- },
- {
- .procname = "overflowuid",
- .data = &overflowuid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_MAXOLDUID,
- },
- {
- .procname = "overflowgid",
- .data = &overflowgid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_MAXOLDUID,
- },
- {
- .procname = "panic_on_oops",
- .data = &panic_on_oops,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "panic_print",
- .data = &panic_print,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
.procname = "ngroups_max",
.data = (void *)&ngroups_max,
.maxlen = sizeof (int),
@@ -1800,20 +1480,10 @@ static const struct ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = proc_dointvec,
},
-#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
- defined(CONFIG_DEBUG_STACKOVERFLOW)
- {
- .procname = "panic_on_stackoverflow",
- .data = &sysctl_panic_on_stackoverflow,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#if defined(CONFIG_MMU)
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
{
- .procname = "randomize_va_space",
- .data = &randomize_va_space,
+ .procname = "unaligned-trap",
+ .data = &unaligned_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
@@ -1828,49 +1498,11 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_RT_MUTEXES
- {
- .procname = "max_lock_depth",
- .data = &max_lock_depth,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
- {
- .procname = "panic_on_warn",
- .data = &panic_on_warn,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#ifdef CONFIG_TREE_RCU
- {
- .procname = "panic_on_rcu_stall",
- .data = &sysctl_panic_on_rcu_stall,
- .maxlen = sizeof(sysctl_panic_on_rcu_stall),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "max_rcu_stall_to_panic",
- .data = &sysctl_max_rcu_stall_to_panic,
- .maxlen = sizeof(sysctl_max_rcu_stall_to_panic),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_INT_MAX,
- },
-#endif
};
int __init sysctl_init_bases(void)
{
- register_sysctl_init("kernel", kern_table);
+ register_sysctl_init("kernel", sysctl_subsys_table);
return 0;
}
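The entries deleted from kern_table above presumably move next to the code that owns them, as the signal.c and sys.c hunks earlier in this series show for print-fatal-signals and overflowuid/overflowgid. The destination of the other entries is outside this hunk, so the sketch below only illustrates the pattern, using panic_on_oops as a hypothetical example:

static const struct ctl_table panic_sysctl_table[] = {
	{
		.procname	= "panic_on_oops",
		.data		= &panic_on_oops,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int __init init_panic_sysctls(void)
{
	/* Same /proc/sys/kernel/ path as before, registered by the owner. */
	register_sysctl_init("kernel", panic_sysctl_table);
	return 0;
}
early_initcall(init_panic_sysctls);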
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b0b97a60aaa6..7c6a52f7836c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE
help
Tracks idle state on behalf of RCU.
-if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
+if GENERIC_CLOCKEVENTS
# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
# only related to the tick functionality. Oneshot clockevent devices
# are supported independent of this.
@@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
interval and NTP's maximum frequency drift of 500 parts
per million. If the clocksource is good enough for NTP,
it is good enough for the clocksource watchdog!
+endif
+
+config POSIX_AUX_CLOCKS
+ bool "Enable auxiliary POSIX clocks"
+ depends on POSIX_TIMERS
+ help
+ Auxiliary POSIX clocks are clocks which can be steered
+ independently of the core timekeeper, which controls the
+ MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to
+ provide e.g. lockless time accessors to independent PTP clocks
+ and other clock domains, which are not correlated to the TAI/NTP
+ notion of time.
endmenu
-endif
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0ddccdff119a..577f0e6842d4 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -70,12 +70,10 @@ static DEFINE_SPINLOCK(rtcdev_lock);
*/
struct rtc_device *alarmtimer_get_rtcdev(void)
{
- unsigned long flags;
struct rtc_device *ret;
- spin_lock_irqsave(&rtcdev_lock, flags);
+ guard(spinlock_irqsave)(&rtcdev_lock);
ret = rtcdev;
- spin_unlock_irqrestore(&rtcdev_lock, flags);
return ret;
}
@@ -83,7 +81,6 @@ EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
static int alarmtimer_rtc_add_device(struct device *dev)
{
- unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
struct platform_device *pdev;
int ret = 0;
@@ -101,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev)
if (!IS_ERR(pdev))
device_init_wakeup(&pdev->dev, true);
- spin_lock_irqsave(&rtcdev_lock, flags);
- if (!IS_ERR(pdev) && !rtcdev) {
- if (!try_module_get(rtc->owner)) {
+ scoped_guard(spinlock_irqsave, &rtcdev_lock) {
+ if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) {
+ rtcdev = rtc;
+ /* hold a reference so it doesn't go away */
+ get_device(dev);
+ pdev = NULL;
+ } else {
ret = -1;
- goto unlock;
}
-
- rtcdev = rtc;
- /* hold a reference so it doesn't go away */
- get_device(dev);
- pdev = NULL;
- } else {
- ret = -1;
}
-unlock:
- spin_unlock_irqrestore(&rtcdev_lock, flags);
platform_device_unregister(pdev);
-
return ret;
}
@@ -198,7 +188,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
struct alarm *alarm = container_of(timer, struct alarm, timer);
struct alarm_base *base = &alarm_bases[alarm->type];
- scoped_guard (spinlock_irqsave, &base->lock)
+ scoped_guard(spinlock_irqsave, &base->lock)
alarmtimer_dequeue(base, alarm);
if (alarm->function)
@@ -228,17 +218,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
static int alarmtimer_suspend(struct device *dev)
{
ktime_t min, now, expires;
- int i, ret, type;
struct rtc_device *rtc;
- unsigned long flags;
struct rtc_time tm;
+ int i, ret, type;
- spin_lock_irqsave(&freezer_delta_lock, flags);
- min = freezer_delta;
- expires = freezer_expires;
- type = freezer_alarmtype;
- freezer_delta = 0;
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
+ scoped_guard(spinlock_irqsave, &freezer_delta_lock) {
+ min = freezer_delta;
+ expires = freezer_expires;
+ type = freezer_alarmtype;
+ freezer_delta = 0;
+ }
rtc = alarmtimer_get_rtcdev();
/* If we have no rtcdev, just return */
@@ -251,9 +240,8 @@ static int alarmtimer_suspend(struct device *dev)
struct timerqueue_node *next;
ktime_t delta;
- spin_lock_irqsave(&base->lock, flags);
- next = timerqueue_getnext(&base->timerqueue);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock)
+ next = timerqueue_getnext(&base->timerqueue);
if (!next)
continue;
delta = ktime_sub(next->expires, base->get_ktime());
@@ -352,13 +340,12 @@ EXPORT_SYMBOL_GPL(alarm_init);
void alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
- alarm->node.expires = start;
- alarmtimer_enqueue(base, alarm);
- hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ alarm->node.expires = start;
+ alarmtimer_enqueue(base, alarm);
+ hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
+ }
trace_alarmtimer_start(alarm, base->get_ktime());
}
@@ -381,13 +368,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative);
void alarm_restart(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
+ guard(spinlock_irqsave)(&base->lock);
hrtimer_set_expires(&alarm->timer, alarm->node.expires);
hrtimer_restart(&alarm->timer);
alarmtimer_enqueue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(alarm_restart);
@@ -401,14 +386,13 @@ EXPORT_SYMBOL_GPL(alarm_restart);
int alarm_try_to_cancel(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
int ret;
- spin_lock_irqsave(&base->lock, flags);
- ret = hrtimer_try_to_cancel(&alarm->timer);
- if (ret >= 0)
- alarmtimer_dequeue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ ret = hrtimer_try_to_cancel(&alarm->timer);
+ if (ret >= 0)
+ alarmtimer_dequeue(base, alarm);
+ }
trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
@@ -479,7 +463,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now);
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
{
struct alarm_base *base;
- unsigned long flags;
ktime_t delta;
switch(type) {
@@ -498,13 +481,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
delta = ktime_sub(absexp, base->get_ktime());
- spin_lock_irqsave(&freezer_delta_lock, flags);
+ guard(spinlock_irqsave)(&freezer_delta_lock);
if (!freezer_delta || (delta < freezer_delta)) {
freezer_delta = delta;
freezer_expires = absexp;
freezer_alarmtype = type;
}
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
}
/**
@@ -515,9 +497,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
{
if (clockid == CLOCK_REALTIME_ALARM)
return ALARM_REALTIME;
- if (clockid == CLOCK_BOOTTIME_ALARM)
- return ALARM_BOOTTIME;
- return -1;
+
+ WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM);
+ return ALARM_BOOTTIME;
}
/**
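The alarmtimer conversions above all use the <linux/cleanup.h> lock guards: guard() when the critical section runs to the end of the function, scoped_guard() when it must end earlier. A minimal standalone sketch with hypothetical data:

static DEFINE_SPINLOCK(example_lock);
static int example_counter;

static int example_read(void)
{
	/* Lock is dropped automatically when the function returns. */
	guard(spinlock_irqsave)(&example_lock);
	return example_counter;
}

static void example_bump(void)
{
	scoped_guard(spinlock_irqsave, &example_lock)
		example_counter++;

	/* Executed after the lock has been released. */
	pr_debug("example counter bumped\n");
}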
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index bb48498ebb5a..e400fe150f9d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void)
{
int cpu, i, n = verify_n_cpus;
- if (n < 0) {
+ if (n < 0 || n >= num_online_cpus()) {
/* Check all of the CPUs. */
cpumask_copy(&cpus_chosen, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
@@ -323,9 +323,7 @@ static void clocksource_verify_choose_cpus(void)
return;
/* Make sure to select at least one CPU other than the current CPU. */
- cpu = cpumask_first(cpu_online_mask);
- if (cpu == smp_processor_id())
- cpu = cpumask_next(cpu, cpu_online_mask);
+ cpu = cpumask_any_but(cpu_online_mask, smp_processor_id());
if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
return;
cpumask_set_cpu(cpu, &cpus_chosen);
@@ -589,9 +587,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* Cycle through CPUs to check if the CPUs stay synchronized
* to each other.
*/
- next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(cpu_online_mask);
+ next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask);
/*
* Arm timer if not already pending: could race with concurrent
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index bc4db9e5ab70..34eeacac2253 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -75,13 +75,11 @@ struct clocksource * __init __weak clocksource_default_clock(void)
static struct clocksource refined_jiffies;
-int register_refined_jiffies(long cycles_per_second)
+void __init register_refined_jiffies(long cycles_per_second)
{
u64 nsec_per_tick, shift_hz;
long cycles_per_tick;
-
-
refined_jiffies = clocksource_jiffies;
refined_jiffies.name = "refined-jiffies";
refined_jiffies.rating++;
@@ -100,5 +98,4 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
__clocksource_register(&refined_jiffies);
- return 0;
}
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index e3642278df43..667452768ed3 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -242,6 +242,11 @@ static void timens_set_vvar_page(struct task_struct *task,
for (i = 0; i < CS_BASES; i++)
timens_setup_vdso_clock_data(&vc[i], ns);
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+ for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+ timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+ }
+
out:
mutex_unlock(&offset_lock);
}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b837d3d9d325..97fa99b96dd0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/rtc.h>
#include <linux/audit.h>
+#include <linux/timekeeper_internal.h>
#include "ntp_internal.h"
#include "timekeeping_internal.h"
@@ -86,14 +87,16 @@ struct ntp_data {
#endif
};
-static struct ntp_data tk_ntp_data = {
- .tick_usec = USER_TICK_USEC,
- .time_state = TIME_OK,
- .time_status = STA_UNSYNC,
- .time_constant = 2,
- .time_maxerror = NTP_PHASE_LIMIT,
- .time_esterror = NTP_PHASE_LIMIT,
- .ntp_next_leap_sec = TIME64_MAX,
+static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = {
+ [ 0 ... TIMEKEEPERS_MAX - 1 ] = {
+ .tick_usec = USER_TICK_USEC,
+ .time_state = TIME_OK,
+ .time_status = STA_UNSYNC,
+ .time_constant = 2,
+ .time_maxerror = NTP_PHASE_LIMIT,
+ .time_esterror = NTP_PHASE_LIMIT,
+ .ntp_next_leap_sec = TIME64_MAX,
+ },
};
#define SECS_PER_DAY 86400
@@ -300,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset)
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
- real_secs = __ktime_get_real_seconds();
+ real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
secs = (long)(real_secs - ntpdata->time_reftime);
if (unlikely(ntpdata->time_status & STA_FREQHOLD))
secs = 0;
@@ -348,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata)
/**
* ntp_clear - Clears the NTP state variables
+ * @tkid: Timekeeper ID to be able to select proper ntp data array member
*/
-void ntp_clear(void)
+void ntp_clear(unsigned int tkid)
{
- __ntp_clear(&tk_ntp_data);
+ __ntp_clear(&tk_ntp_data[tkid]);
}
-u64 ntp_tick_length(void)
+u64 ntp_tick_length(unsigned int tkid)
{
- return tk_ntp_data.tick_length;
+ return tk_ntp_data[tkid].tick_length;
}
/**
* ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ * @tkid: Timekeeper ID
*
- * Provides the time of the next leapsecond against CLOCK_REALTIME in
- * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next
+ * leap second against CLOCK_REALTIME in a ktime_t format if a
+ * leap second is pending. KTIME_MAX otherwise.
*/
-ktime_t ntp_get_next_leap(void)
+ktime_t ntp_get_next_leap(unsigned int tkid)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
- ktime_t ret;
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
+
+ if (tkid != TIMEKEEPER_CORE)
+ return KTIME_MAX;
if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS))
return ktime_set(ntpdata->ntp_next_leap_sec, 0);
- ret = KTIME_MAX;
- return ret;
+
+ return KTIME_MAX;
}
/*
@@ -387,9 +395,9 @@ ktime_t ntp_get_next_leap(void)
*
* Also handles leap second processing, and returns leap offset
*/
-int second_overflow(time64_t secs)
+int second_overflow(unsigned int tkid, time64_t secs)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
s64 delta;
int leap = 0;
s32 rem;
@@ -605,7 +613,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns
*/
static inline bool ntp_synced(void)
{
- return !(tk_ntp_data.time_status & STA_UNSYNC);
+ return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC);
}
/*
@@ -702,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k
* reference time to current time.
*/
if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL))
- ntpdata->time_reftime = __ktime_get_real_seconds();
+ ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
/* only set allowed bits */
ntpdata->time_status &= STA_RONLY;
@@ -759,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct
* adjtimex() mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad)
+int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
int result;
if (txc->modes & ADJ_ADJTIME) {
@@ -1031,8 +1039,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error)
*/
void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
struct pps_normtime pts_norm, freq_norm;
- struct ntp_data *ntpdata = &tk_ntp_data;
pts_norm = pps_normalize_ts(*phase_ts);
@@ -1083,18 +1091,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t
static int __init ntp_tick_adj_setup(char *str)
{
- int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj);
+ int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj);
if (rc)
return rc;
- tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT;
+ tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT;
return 1;
}
-
__setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
- ntp_clear();
+ for (int id = 0; id < TIMEKEEPERS_MAX; id++)
+ __ntp_clear(tk_ntp_data + id);
ntp_init_cmos_sync();
}
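The tk_ntp_data conversion above uses a range designator so that every per-timekeeper slot starts from the same defaults. A standalone sketch of the construct with hypothetical names and sizes (the real initializer values are in the hunk above):

#define EXAMPLE_TK_MAX	4

struct example_state {
	int	time_state;
	int	time_status;
	long	time_constant;
};

/* Elements 0 through EXAMPLE_TK_MAX - 1 all receive the same initializer. */
static struct example_state example_states[EXAMPLE_TK_MAX] = {
	[0 ... EXAMPLE_TK_MAX - 1] = {
		.time_state	= 0,
		.time_status	= 0,
		.time_constant	= 2,
	},
};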
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 5a633dce9057..7084d839c207 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -3,14 +3,13 @@
#define _LINUX_NTP_INTERNAL_H
extern void ntp_init(void);
-extern void ntp_clear(void);
+extern void ntp_clear(unsigned int tkid);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
-extern u64 ntp_tick_length(void);
-extern ktime_t ntp_get_next_leap(void);
-extern int second_overflow(time64_t secs);
-extern int __do_adjtimex(struct __kernel_timex *txc,
- const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad);
+extern u64 ntp_tick_length(unsigned int tkid);
+extern ktime_t ntp_get_next_leap(unsigned int tkid);
+extern int second_overflow(unsigned int tkid, time64_t secs);
+extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 50e8d04ab661..2e5b89d7d866 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void)
lockdep_assert_irqs_disabled();
/*
+ * Ensure that release_task(tsk) can't happen while
+ * handle_posix_cpu_timers() is running. Otherwise, a concurrent
+ * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
+ * miss timer->it.cpu.firing != 0.
+ */
+ if (tsk->exit_state)
+ return;
+
+ /*
* If the actual expiry is deferred to task work context and the
* work is already scheduled there is no point to do anything here.
*/
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 6222112533a7..8b582174b1f9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -30,8 +30,6 @@
#include "timekeeping.h"
#include "posix-timers.h"
-static struct kmem_cache *posix_timers_cache;
-
/*
* Timers are managed in a hash table for lockless lookup. The hash key is
* constructed from current::signal and the timer ID and the timer is
@@ -49,10 +47,12 @@ struct timer_hash_bucket {
static struct {
struct timer_hash_bucket *buckets;
unsigned long mask;
-} __timer_data __ro_after_init __aligned(2*sizeof(long));
+ struct kmem_cache *cache;
+} __timer_data __ro_after_init __aligned(4*sizeof(long));
-#define timer_buckets (__timer_data.buckets)
-#define timer_hashmask (__timer_data.mask)
+#define timer_buckets (__timer_data.buckets)
+#define timer_hashmask (__timer_data.mask)
+#define posix_timers_cache (__timer_data.cache)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
@@ -283,14 +283,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-static __init int init_posix_timers(void)
-{
- posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
- __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL);
- return 0;
-}
-__initcall(init_posix_timers);
-
/*
* The siginfo si_overrun field and the return value of timer_getoverrun(2)
* are of type int. Clamp the overrun value to INT_MAX
@@ -1534,6 +1526,9 @@ static const struct k_clock * const posix_clocks[] = {
[CLOCK_REALTIME_ALARM] = &alarm_clock,
[CLOCK_BOOTTIME_ALARM] = &alarm_clock,
[CLOCK_TAI] = &clock_tai,
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+ [CLOCK_AUX ... CLOCK_AUX_LAST] = &clock_aux,
+#endif
};
static const struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -1556,6 +1551,11 @@ static int __init posixtimer_init(void)
unsigned long i, size;
unsigned int shift;
+ posix_timers_cache = kmem_cache_create("posix_timers_cache",
+ sizeof(struct k_itimer),
+ __alignof__(struct k_itimer),
+ SLAB_ACCOUNT, NULL);
+
if (IS_ENABLED(CONFIG_BASE_SMALL))
size = 512;
else
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 61906f0688c1..7f259e845d24 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic;
extern const struct k_clock clock_process;
extern const struct k_clock clock_thread;
extern const struct k_clock alarm_clock;
+extern const struct k_clock clock_aux;
void posix_timer_queue_signal(struct k_itimer *timr);
diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
index c0e960a5de39..3c90574bd904 100644
--- a/kernel/time/sleep_timeout.c
+++ b/kernel/time/sleep_timeout.c
@@ -22,7 +22,7 @@ struct process_timer {
static void process_timeout(struct timer_list *t)
{
- struct process_timer *timeout = from_timer(timeout, t, timer);
+ struct process_timer *timeout = timer_container_of(timeout, t, timer);
wake_up_process(timeout->task);
}
@@ -100,7 +100,7 @@ signed long __sched schedule_timeout(signed long timeout)
timer_delete_sync(&timer.timer);
/* Remove the timer from the object tracker */
- destroy_timer_on_stack(&timer.timer);
+ timer_destroy_on_stack(&timer.timer);
timeout = expire - jiffies;
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index e6285288d765..3d2a354cfe1c 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -6,7 +6,7 @@
#include <linux/timecounter.h>
void timecounter_init(struct timecounter *tc,
- const struct cyclecounter *cc,
+ struct cyclecounter *cc,
u64 start_tstamp)
{
tc->cc = cc;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index a009c91f7b05..059fa8b79be6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -6,6 +6,7 @@
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/kobject.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -25,6 +26,8 @@
#include <linux/audit.h>
#include <linux/random.h>
+#include <vdso/auxclock.h>
+
#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"
@@ -53,7 +56,32 @@ struct tk_data {
raw_spinlock_t lock;
} ____cacheline_aligned;
-static struct tk_data tk_core;
+static struct tk_data timekeeper_data[TIMEKEEPERS_MAX];
+
+/* The core timekeeper */
+#define tk_core (timekeeper_data[TIMEKEEPER_CORE])
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST;
+}
+#else
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return false;
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return false;
+}
+#endif
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -113,6 +141,16 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.base[1] = FAST_TK_INIT,
};
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static __init void tk_aux_setup(void);
+static void tk_aux_update_clocksource(void);
+static void tk_aux_advance(void);
+#else
+static inline void tk_aux_setup(void) { }
+static inline void tk_aux_update_clocksource(void) { }
+static inline void tk_aux_advance(void) { }
+#endif
+
unsigned long timekeeper_lock_irqsave(void)
{
unsigned long flags;
@@ -601,7 +639,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
*/
static inline void tk_update_leap_state(struct timekeeper *tk)
{
- tk->next_leap_ktime = ntp_get_next_leap();
+ tk->next_leap_ktime = ntp_get_next_leap(tk->id);
if (tk->next_leap_ktime != KTIME_MAX)
/* Convert to monotonic time */
tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
@@ -663,7 +701,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd)
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
- struct timekeeper *tk = &tk_core.shadow_timekeeper;
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
lockdep_assert_held(&tkd->lock);
@@ -678,18 +716,22 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
if (action & TK_CLEAR_NTP) {
tk->ntp_error = 0;
- ntp_clear();
+ ntp_clear(tk->id);
}
tk_update_leap_state(tk);
tk_update_ktime_data(tk);
+ tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_vsyscall(tk);
- update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ if (tk->id == TIMEKEEPER_CORE) {
+ update_vsyscall(tk);
+ update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
- tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
- update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ } else if (tk_is_aux(tk)) {
+ vdso_time_update_aux(tk);
+ }
if (action & TK_CLOCK_WAS_SET)
tk->clock_was_set_seq++;
@@ -975,9 +1017,14 @@ time64_t ktime_get_real_seconds(void)
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
/**
- * __ktime_get_real_seconds - The same as ktime_get_real_seconds
- * but without the sequence counter protect. This internal function
- * is called just when timekeeping lock is already held.
+ * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds
+ *
+ * The same as ktime_get_real_seconds() but without the sequence counter
+ * protection. This function is used in restricted contexts like the x86 MCE
+ * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half
+ * completed modification and only to be used for such critical contexts.
+ *
+ * Returns: Racy snapshot of the CLOCK_REALTIME seconds value
*/
noinstr time64_t __ktime_get_real_seconds(void)
{
@@ -1256,7 +1303,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
struct system_time_snapshot *history_begin,
struct system_device_crosststamp *xtstamp)
{
- struct system_counterval_t system_counterval;
+ struct system_counterval_t system_counterval = {};
struct timekeeper *tk = &tk_core.timekeeper;
u64 cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
@@ -1412,41 +1459,73 @@ int do_settimeofday64(const struct timespec64 *ts)
}
EXPORT_SYMBOL(do_settimeofday64);
+static inline bool timekeeper_is_core_tk(struct timekeeper *tk)
+{
+ return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE;
+}
+
/**
- * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * __timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @tkd: Pointer to the timekeeper to modify
* @ts: Pointer to the timespec variable containing the offset
*
* Adds or subtracts an offset value from the current time.
*/
-static int timekeeping_inject_offset(const struct timespec64 *ts)
+static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts)
{
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+ struct timespec64 tmp;
+
if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
- struct timekeeper *tks = &tk_core.shadow_timekeeper;
- struct timespec64 tmp;
-
- timekeeping_forward_now(tks);
+ timekeeping_forward_now(tks);
+ if (timekeeper_is_core_tk(tks)) {
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tks), *ts);
if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 ||
!timespec64_valid_settod(&tmp)) {
- timekeeping_restore_shadow(&tk_core);
+ timekeeping_restore_shadow(tkd);
return -EINVAL;
}
tk_xtime_add(tks, ts);
tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
- timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ } else {
+ struct tk_read_base *tkr_mono = &tks->tkr_mono;
+ ktime_t now, offs;
+
+ /* Get the current time */
+ now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono));
+ /* Add the relative offset change */
+ offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts));
+
+ /* Prevent that the resulting time becomes negative */
+ if (ktime_add(now, offs) < 0) {
+ timekeeping_restore_shadow(tkd);
+ return -EINVAL;
+ }
+ tks->offs_aux = offs;
}
- /* Signal hrtimers about time change */
- clock_was_set(CLOCK_SET_WALL);
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
return 0;
}
+static int timekeeping_inject_offset(const struct timespec64 *ts)
+{
+ int ret;
+
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock)
+ ret = __timekeeping_inject_offset(&tk_core, ts);
+
+ /* Signal hrtimers about time change */
+ if (!ret)
+ clock_was_set(CLOCK_SET_WALL);
+ return ret;
+}
+
/*
* Indicates if there is an offset between the system clock and the hardware
* clock/persistent clock/rtc.
@@ -1522,6 +1601,8 @@ static int change_clocksource(void *data)
timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
}
+ tk_aux_update_clocksource();
+
if (old) {
if (old->disable)
old->disable(old);
@@ -1573,6 +1654,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_raw_ts64);
+/**
+ * ktime_get_clock_ts64 - Returns time of a clock in a timespec
+ * @id: POSIX clock ID of the clock to read
+ * @ts: Pointer to the timespec64 to be set
+ *
+ * The timestamp is invalidated (@ts->tv_sec is set to -1) if the
+ * clock @id is not available.
+ */
+void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts)
+{
+ /* Invalidate time stamp */
+ ts->tv_sec = -1;
+ ts->tv_nsec = 0;
+
+ switch (id) {
+ case CLOCK_REALTIME:
+ ktime_get_real_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC_RAW:
+ ktime_get_raw_ts64(ts);
+ return;
+ case CLOCK_AUX ... CLOCK_AUX_LAST:
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
+ ktime_get_aux_ts64(id, ts);
+ return;
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+EXPORT_SYMBOL_GPL(ktime_get_clock_ts64);
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1649,10 +1763,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
*boot_offset = ns_to_timespec64(local_clock());
}
-static __init void tkd_basic_setup(struct tk_data *tkd)
+static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid)
{
raw_spin_lock_init(&tkd->lock);
seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
+ tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id;
+ tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid;
}
/*
@@ -1682,7 +1798,8 @@ void __init timekeeping_init(void)
struct timekeeper *tks = &tk_core.shadow_timekeeper;
struct clocksource *clock;
- tkd_basic_setup(&tk_core);
+ tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true);
+ tk_aux_setup();
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
if (timespec64_valid_settod(&wall_time) &&
@@ -2034,7 +2151,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*/
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
- u64 ntp_tl = ntp_tick_length();
+ u64 ntp_tl = ntp_tick_length(tk->id);
u32 mult;
/*
@@ -2115,7 +2232,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
}
/* Figure out if its a leap sec and apply if needed */
- leap = second_overflow(tk->xtime_sec);
+ leap = second_overflow(tk->id, tk->xtime_sec);
if (unlikely(leap)) {
struct timespec64 ts;
@@ -2181,16 +2298,14 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
*/
-static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode)
{
- struct timekeeper *tk = &tk_core.shadow_timekeeper;
- struct timekeeper *real_tk = &tk_core.timekeeper;
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
+ struct timekeeper *real_tk = &tkd->timekeeper;
unsigned int clock_set = 0;
int shift = 0, maxshift;
u64 offset, orig_offset;
- guard(raw_spinlock_irqsave)(&tk_core.lock);
-
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return false;
@@ -2214,7 +2329,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
shift = ilog2(offset) - ilog2(tk->cycle_interval);
shift = max(0, shift);
/* Bound shift to one less than what overflows tick_length */
- maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
+ maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1;
shift = min(shift, maxshift);
while (offset >= tk->cycle_interval) {
offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
@@ -2239,19 +2354,27 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
if (orig_offset != offset)
tk_update_coarse_nsecs(tk);
- timekeeping_update_from_shadow(&tk_core, clock_set);
+ timekeeping_update_from_shadow(tkd, clock_set);
return !!clock_set;
}
+static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+{
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
+ return __timekeeping_advance(&tk_core, mode);
+}
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
+ * It also updates the enabled auxiliary clock timekeepers
*/
void update_wall_time(void)
{
if (timekeeping_advance(TK_ADV_TICK))
clock_was_set_delayed();
+ tk_aux_advance();
}
/**
@@ -2449,7 +2572,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
/*
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-static int timekeeping_validate_timex(const struct __kernel_timex *txc)
+static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock)
{
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
@@ -2508,6 +2631,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
return -EINVAL;
}
+ if (aux_clock) {
+ /* Auxiliary clocks are similar to TAI and do not have leap seconds */
+ if (txc->status & (STA_INS | STA_DEL))
+ return -EINVAL;
+
+ /* No TAI offset setting */
+ if (txc->modes & ADJ_TAI)
+ return -EINVAL;
+
+ /* No PPS support either */
+ if (txc->status & (STA_PPSFREQ | STA_PPSTIME))
+ return -EINVAL;
+ }
+
return 0;
}
@@ -2526,74 +2663,103 @@ unsigned long random_get_entropy_fallback(void)
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
-/**
- * do_adjtimex() - Accessor function to NTP __do_adjtimex function
- * @txc: Pointer to kernel_timex structure containing NTP parameters
- */
-int do_adjtimex(struct __kernel_timex *txc)
+struct adjtimex_result {
+ struct audit_ntp_data ad;
+ struct timespec64 delta;
+ bool clock_set;
+};
+
+static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc,
+ struct adjtimex_result *result)
{
- struct audit_ntp_data ad;
- bool offset_set = false;
- bool clock_set = false;
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+ bool aux_clock = !timekeeper_is_core_tk(tks);
struct timespec64 ts;
+ s32 orig_tai, tai;
int ret;
/* Validate the data before disabling interrupts */
- ret = timekeeping_validate_timex(txc);
+ ret = timekeeping_validate_timex(txc, aux_clock);
if (ret)
return ret;
add_device_randomness(txc, sizeof(*txc));
- if (txc->modes & ADJ_SETOFFSET) {
- struct timespec64 delta;
+ if (!aux_clock)
+ ktime_get_real_ts64(&ts);
+ else
+ tk_get_aux_ts64(tkd->timekeeper.id, &ts);
- delta.tv_sec = txc->time.tv_sec;
- delta.tv_nsec = txc->time.tv_usec;
+ add_device_randomness(&ts, sizeof(ts));
+
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+
+ if (!tks->clock_valid)
+ return -ENODEV;
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ result->delta.tv_sec = txc->time.tv_sec;
+ result->delta.tv_nsec = txc->time.tv_usec;
if (!(txc->modes & ADJ_NANO))
- delta.tv_nsec *= 1000;
- ret = timekeeping_inject_offset(&delta);
+ result->delta.tv_nsec *= 1000;
+ ret = __timekeeping_inject_offset(tkd, &result->delta);
if (ret)
return ret;
-
- offset_set = delta.tv_sec != 0;
- audit_tk_injoffset(delta);
+ result->clock_set = true;
}
- audit_ntp_init(&ad);
+ orig_tai = tai = tks->tai_offset;
+ ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad);
- ktime_get_real_ts64(&ts);
- add_device_randomness(&ts, sizeof(ts));
+ if (tai != orig_tai) {
+ __timekeeping_set_tai_offset(tks, tai);
+ timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET);
+ result->clock_set = true;
+ } else {
+ tk_update_leap_state_all(&tk_core);
+ }
- scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
- struct timekeeper *tks = &tk_core.shadow_timekeeper;
- s32 orig_tai, tai;
+ /* Update the multiplier immediately if frequency was set directly */
+ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
+ result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ);
- orig_tai = tai = tks->tai_offset;
- ret = __do_adjtimex(txc, &ts, &tai, &ad);
+ return ret;
+}
- if (tai != orig_tai) {
- __timekeeping_set_tai_offset(tks, tai);
- timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
- clock_set = true;
- } else {
- tk_update_leap_state_all(&tk_core);
- }
- }
+/**
+ * do_adjtimex() - Accessor function to the core timekeeper's __do_adjtimex()
+ * @txc: Pointer to kernel_timex structure containing NTP parameters
+ */
+int do_adjtimex(struct __kernel_timex *txc)
+{
+ struct adjtimex_result result = { };
+ int ret;
- audit_ntp_log(&ad);
+ ret = __do_adjtimex(&tk_core, txc, &result);
+ if (ret < 0)
+ return ret;
- /* Update the multiplier immediately if frequency was set directly */
- if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
- clock_set |= timekeeping_advance(TK_ADV_FREQ);
+ if (txc->modes & ADJ_SETOFFSET)
+ audit_tk_injoffset(result.delta);
- if (clock_set)
+ audit_ntp_log(&result.ad);
+
+ if (result.clock_set)
clock_was_set(CLOCK_SET_WALL);
- ntp_notify_cmos_timer(offset_set);
+ ntp_notify_cmos_timer(result.delta.tv_sec != 0);
return ret;
}
+/*
+ * Invoked from NTP with the timekeeper lock held, so lockless access is
+ * fine.
+ */
+long ktime_get_ntp_seconds(unsigned int id)
+{
+ return timekeeper_data[id].timekeeper.xtime_sec;
+}
+
#ifdef CONFIG_NTP_PPS
/**
* hardpps() - Accessor function to NTP __hardpps function
@@ -2607,3 +2773,316 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+#include "posix-timers.h"
+
+/*
+ * Bitmap for the activated auxiliary timekeepers to allow lockless quick
+ * checks in the hot paths without touching extra cache lines. If set, then
+ * the state of the corresponding timekeeper has to be re-checked under
+ * timekeeper::lock.
+ */
+static unsigned long aux_timekeepers;
+
+static inline unsigned int clockid_to_tkid(unsigned int id)
+{
+ return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX;
+}
+
+static inline struct tk_data *aux_get_tk_data(clockid_t id)
+{
+ if (!clockid_aux_valid(id))
+ return NULL;
+ return &timekeeper_data[clockid_to_tkid(id)];
+}
+
+/* Invoked from timekeeping after a clocksource change */
+static void tk_aux_update_clocksource(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+ if (!tks->clock_valid)
+ continue;
+
+ timekeeping_forward_now(tks);
+ tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock);
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
+ }
+}
+
+static void tk_aux_advance(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ /* Lockless quick check to avoid extra cache lines */
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+
+ guard(raw_spinlock)(&aux_tkd->lock);
+ if (aux_tkd->shadow_timekeeper.clock_valid)
+ __timekeeping_advance(aux_tkd, TK_ADV_TICK);
+ }
+}
+
+/**
+ * ktime_get_aux - Get time for an AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @kt: Pointer to ktime_t to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux(clockid_t id, ktime_t *kt)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tk;
+ unsigned int seq;
+ ktime_t base;
+ u64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ if (!aux_tkd)
+ return false;
+
+ aux_tk = &aux_tkd->timekeeper;
+ do {
+ seq = read_seqcount_begin(&aux_tkd->seq);
+ if (!aux_tk->clock_valid)
+ return false;
+
+ base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux);
+ nsecs = timekeeping_get_ns(&aux_tk->tkr_mono);
+ } while (read_seqcount_retry(&aux_tkd->seq, seq));
+
+ *kt = ktime_add_ns(base, nsecs);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux);
+
+/**
+ * ktime_get_aux_ts64 - Get time for an AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @ts: Pointer to timespec64 to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
+{
+ ktime_t now;
+
+ if (!ktime_get_aux(id, &now))
+ return false;
+ *ts = ktime_to_timespec64(now);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);
+
+static int aux_get_res(clockid_t id, struct timespec64 *tp)
+{
+ if (!clockid_aux_valid(id))
+ return -ENODEV;
+
+ tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC;
+ tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC;
+ return 0;
+}
+
+static int aux_get_timespec(clockid_t id, struct timespec64 *tp)
+{
+ return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV;
+}
+
+static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks;
+ ktime_t tnow, nsecs;
+
+ if (!timespec64_valid_settod(tnew))
+ return -EINVAL;
+ if (!aux_tkd)
+ return -ENODEV;
+
+ aux_tks = &aux_tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ if (!aux_tks->clock_valid)
+ return -ENODEV;
+
+ /* Forward the timekeeper base time */
+ timekeeping_forward_now(aux_tks);
+ /*
+ * Get the updated base time. tkr_mono.base has not been
+ * updated yet, so do that first. That makes the update
+ * in timekeeping_update_from_shadow() redundant, but
+ * that's harmless. After that @tnow can be calculated
+ * by using tkr_mono::cycle_last, which has been set
+ * by timekeeping_forward_now().
+ */
+ tk_update_ktime_data(aux_tks);
+ nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last);
+ tnow = ktime_add(aux_tks->tkr_mono.base, nsecs);
+
+ /*
+ * Calculate the new AUX offset as delta to @tnow ("monotonic").
+ * That avoids all the tk::xtime back and forth conversions as
+ * xtime ("realtime") is not applicable for auxiliary clocks and
+ * kept in sync with "monotonic".
+ */
+ aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow);
+
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+ return 0;
+}
+
+static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct adjtimex_result result = { };
+
+ if (!aux_tkd)
+ return -ENODEV;
+
+ /*
+ * @result is ignored for now as there are neither hrtimers nor an
+ * RTC related to auxiliary clocks.
+ */
+ return __do_adjtimex(aux_tkd, txc, &result);
+}
+
+const struct k_clock clock_aux = {
+ .clock_getres = aux_get_res,
+ .clock_get_timespec = aux_get_timespec,
+ .clock_set = aux_clock_set,
+ .clock_adj = aux_clock_adj,
+};
+
+static void aux_clock_enable(clockid_t id)
+{
+ struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw;
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper;
+
+ /* Prevent the core timekeeper from changing. */
+ guard(raw_spinlock_irq)(&tk_core.lock);
+
+ /*
+ * Setup the auxiliary clock assuming that the raw core timekeeper
+ * clock frequency conversion is close enough. Userspace has to
+ * adjust for the deviation via clock_adjtime(2).
+ */
+ guard(raw_spinlock_nested)(&aux_tkd->lock);
+
+ /* Remove leftovers of a previous registration */
+ memset(aux_tks, 0, sizeof(*aux_tks));
+ /* Restore the timekeeper id */
+ aux_tks->id = aux_tkd->timekeeper.id;
+ /* Setup the timekeeper based on the current system clocksource */
+ tk_setup_internals(aux_tks, tkr_raw->clock);
+
+ /* Mark it valid and set it live */
+ aux_tks->clock_valid = true;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static void aux_clock_disable(clockid_t id)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ aux_tkd->shadow_timekeeper.clock_valid = false;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static DEFINE_MUTEX(aux_clock_mutex);
+
+static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+ bool enable;
+
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (kstrtobool(buf, &enable) < 0)
+ return -EINVAL;
+
+ guard(mutex)(&aux_clock_mutex);
+ if (enable == test_bit(id, &aux_timekeepers))
+ return count;
+
+ if (enable) {
+ aux_clock_enable(CLOCK_AUX + id);
+ set_bit(id, &aux_timekeepers);
+ } else {
+ aux_clock_disable(CLOCK_AUX + id);
+ clear_bit(id, &aux_timekeepers);
+ }
+ return count;
+}
+
+static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+
+ return sysfs_emit(buf, "%d\n", test_bit(id, &active));
+}
+
+static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable);
+
+static struct attribute *aux_clock_enable_attrs[] = {
+ &aux_clock_enable_attr.attr,
+ NULL
+};
+
+static const struct attribute_group aux_clock_enable_attr_group = {
+ .attrs = aux_clock_enable_attrs,
+};
+
+static int __init tk_aux_sysfs_init(void)
+{
+ struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
+
+ if (!tko)
+ return -ENOMEM;
+
+ auxo = kobject_create_and_add("aux_clocks", tko);
+ if (!auxo) {
+ kobject_put(tko);
+ return -ENOMEM;
+ }
+
+ for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
+ char id[2] = { [0] = '0' + i, };
+ struct kobject *clk = kobject_create_and_add(id, auxo);
+
+ if (!clk)
+ return -ENOMEM;
+
+ int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
+
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+late_initcall(tk_aux_sysfs_init);
+
+static __init void tk_aux_setup(void)
+{
+ for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++)
+ tkd_basic_setup(&timekeeper_data[i], i, false);
+}
+#endif /* CONFIG_POSIX_AUX_CLOCKS */
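
The sysfs interface added above exposes the auxiliary clocks under /sys/kernel/time/aux_clocks/<id>/aux_clock_enable (writes require CAP_SYS_TIME); once enabled, a clock is readable through clock_gettime(2) via the CLOCK_AUX IDs wired up in posix-timers.c. A minimal userspace sketch, assuming CLOCK_AUX is provided by the UAPI headers of this series (the fallback define below is only a placeholder):

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_AUX
#define CLOCK_AUX 32	/* placeholder: take the real value from the UAPI headers */
#endif

int main(void)
{
	struct timespec ts;

	/*
	 * Fails with ENODEV until the clock has been enabled, e.g. via
	 * "echo 1 > /sys/kernel/time/aux_clocks/0/aux_clock_enable".
	 */
	if (clock_gettime(CLOCK_AUX, &ts)) {
		perror("clock_gettime(CLOCK_AUX)");
		return 1;
	}
	printf("aux0: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
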
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 8c9079108ffb..973ede670a36 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -45,4 +45,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
unsigned long timekeeper_lock_irqsave(void);
void timekeeper_unlock_irqrestore(unsigned long flags);
+/* NTP specific interface to access the current seconds value */
+long ktime_get_ntp_seconds(unsigned int id);
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 4d915c0a263c..553fa469d7cc 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -386,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
}
/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, false);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies);
-
-/**
* __round_jiffies_relative - function to round jiffies to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -483,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j)
EXPORT_SYMBOL_GPL(round_jiffies_relative);
/**
- * __round_jiffies_up - function to round jiffies up to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * This is the same as __round_jiffies() except that it will never
- * round down. This is useful for timeouts for which the exact time
- * of firing does not matter too much, as long as they don't fire too
- * early.
- */
-unsigned long __round_jiffies_up(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, true);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies_up);
-
-/**
* __round_jiffies_up_relative - function to round jiffies up to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -850,7 +808,7 @@ static void do_init_timer(struct timer_list *timer,
unsigned int flags,
const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer,
+void timer_init_key_on_stack(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags,
const char *name, struct lock_class_key *key)
@@ -858,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer,
debug_object_init_on_stack(timer, &timer_debug_descr);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
+EXPORT_SYMBOL_GPL(timer_init_key_on_stack);
-void destroy_timer_on_stack(struct timer_list *timer)
+void timer_destroy_on_stack(struct timer_list *timer)
{
debug_object_free(timer, &timer_debug_descr);
}
-EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
+EXPORT_SYMBOL_GPL(timer_destroy_on_stack);
#else
static inline void debug_timer_init(struct timer_list *timer) { }
@@ -904,7 +862,7 @@ static void do_init_timer(struct timer_list *timer,
}
/**
- * init_timer_key - initialize a timer
+ * timer_init_key - initialize a timer
* @timer: the timer to be initialized
* @func: timer callback function
* @flags: timer flags
@@ -912,17 +870,17 @@ static void do_init_timer(struct timer_list *timer,
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
*
- * init_timer_key() must be done to a timer prior to calling *any* of the
+ * timer_init_key() must be done to a timer prior to calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer,
+void timer_init_key(struct timer_list *timer,
void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_init(timer);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL(init_timer_key);
+EXPORT_SYMBOL(timer_init_key);
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
@@ -1511,7 +1469,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
}
/**
- * try_to_del_timer_sync - Try to deactivate a timer
+ * timer_delete_sync_try - Try to deactivate a timer
* @timer: Timer to deactivate
*
* This function tries to deactivate a timer. On success the timer is not
@@ -1526,11 +1484,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
* * %1 - The timer was pending and deactivated
* * %-1 - The timer callback function is running on a different CPU
*/
-int try_to_del_timer_sync(struct timer_list *timer)
+int timer_delete_sync_try(struct timer_list *timer)
{
return __try_to_del_timer_sync(timer, false);
}
-EXPORT_SYMBOL(try_to_del_timer_sync);
+EXPORT_SYMBOL(timer_delete_sync_try);
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
@@ -1900,7 +1858,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
- next = base->clk + NEXT_TIMER_MAX_DELTA;
+ next = base->clk + TIMER_NEXT_MAX_DELTA;
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
@@ -1963,7 +1921,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
WRITE_ONCE(base->next_expiry, next);
base->next_expiry_recalc = false;
- base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
+ base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -2015,7 +1973,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
* easy comparable to find out which base holds the first pending timer.
*/
if (!base->timers_pending)
- WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);
+ WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA);
return base->next_expiry;
}
@@ -2399,7 +2357,7 @@ static inline void __run_timers(struct timer_base *base)
* timer at this clk are that all matching timers have been
* dequeued or no timer has been queued since
* base::next_expiry was set to base::clk +
- * NEXT_TIMER_MAX_DELTA.
+ * TIMER_NEXT_MAX_DELTA.
*/
WARN_ON_ONCE(!levels && !base->next_expiry_recalc
&& base->timers_pending);
@@ -2544,7 +2502,7 @@ int timers_prepare_cpu(unsigned int cpu)
for (b = 0; b < NR_BASES; b++) {
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
base->next_expiry_recalc = false;
base->timers_pending = false;
base->is_idle = false;
@@ -2599,7 +2557,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
@@ -2612,7 +2570,7 @@ static void __init init_timer_cpus(void)
init_timer_cpu(cpu);
}
-void __init init_timers(void)
+void __init timers_init(void)
{
init_timer_cpus();
posix_cputimers_init_work();
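
The timer.c hunks above belong to a rename sweep: init_timer_key() becomes timer_init_key(), try_to_del_timer_sync() becomes timer_delete_sync_try(), destroy_timer_on_stack() becomes timer_destroy_on_stack(), and NEXT_TIMER_MAX_DELTA becomes TIMER_NEXT_MAX_DELTA. A caller-side sketch using only names visible in this diff; the demo callback and the start/stop helpers are illustrative, not part of the patch:

#include <linux/jiffies.h>
#include <linux/timer.h>

static struct timer_list demo_timer;
static struct lock_class_key demo_key;

static void demo_timer_fn(struct timer_list *t)
{
	/* timer_container_of() replaces from_timer() for embedded timers */
}

static void demo_timer_start(void)
{
	/* Normally reached through the timer_setup() convenience wrapper */
	timer_init_key(&demo_timer, demo_timer_fn, 0, "demo_timer", &demo_key);
	mod_timer(&demo_timer, jiffies + HZ);
}

static void demo_timer_stop(void)
{
	/* Returns -1 while the callback runs on another CPU */
	if (timer_delete_sync_try(&demo_timer) < 0)
		timer_delete_sync(&demo_timer);
}
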
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 2f6330831f08..c0c54dc5314c 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1405,23 +1405,20 @@ u64 tmigr_quick_check(u64 nextevt)
return KTIME_MAX;
do {
- if (!tmigr_check_lonely(group)) {
+ if (!tmigr_check_lonely(group))
return KTIME_MAX;
- } else {
- /*
- * Since current CPU is active, events may not be sorted
- * from bottom to the top because the CPU's event is ignored
- * up to the top and its sibling's events not propagated upwards.
- * Thus keep track of the lowest observed expiry.
- */
- nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
- if (!group->parent)
- return nextevt;
- }
+
+ /*
+ * Since current CPU is active, events may not be sorted
+ * from bottom to the top because the CPU's event is ignored
+ * up to the top and its sibling's events not propagated upwards.
+ * Thus keep track of the lowest observed expiry.
+ */
+ nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
group = group->parent;
} while (group);
- return KTIME_MAX;
+ return nextevt;
}
/*
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 32ef27c71b57..8ba8b0d8a387 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -15,26 +15,25 @@
#include "timekeeping_internal.h"
+static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base)
+{
+ vc->cycle_last = base->cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ vc->max_cycles = base->clock->max_cycles;
+#endif
+ vc->mask = base->mask;
+ vc->mult = base->mult;
+ vc->shift = base->shift;
+}
+
static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk)
{
struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
u64 nsec, sec;
- vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
-#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
-#endif
- vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
- vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
- vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
- vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
-#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
-#endif
- vc[CS_RAW].mask = tk->tkr_raw.mask;
- vc[CS_RAW].mult = tk->tkr_raw.mult;
- vc[CS_RAW].shift = tk->tkr_raw.shift;
+ fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono);
+ fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw);
/* CLOCK_MONOTONIC */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
@@ -119,7 +118,8 @@ void update_vsyscall(struct timekeeper *tk)
if (clock_mode != VDSO_CLOCKMODE_NONE)
update_vdso_time_data(vdata, tk);
- __arch_update_vsyscall(vdata);
+ __arch_update_vdso_clock(&vc[CS_HRES_COARSE]);
+ __arch_update_vdso_clock(&vc[CS_RAW]);
vdso_write_end(vdata);
@@ -136,6 +136,46 @@ void update_vsyscall_tz(void)
__arch_sync_vdso_time_data(vdata);
}
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+void vdso_time_update_aux(struct timekeeper *tk)
+{
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ struct vdso_timestamp *vdso_ts;
+ struct vdso_clock *vc;
+ s32 clock_mode;
+ u64 nsec;
+
+ vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST];
+ vdso_ts = &vc->basetime[VDSO_BASE_AUX];
+ clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
+ if (!tk->clock_valid)
+ clock_mode = VDSO_CLOCKMODE_NONE;
+
+ /* copy vsyscall data */
+ vdso_write_begin_clock(vc);
+
+ vc->clock_mode = clock_mode;
+
+ if (clock_mode != VDSO_CLOCKMODE_NONE) {
+ fill_clock_configuration(vc, &tk->tkr_mono);
+
+ vdso_ts->sec = tk->xtime_sec;
+
+ nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec += tk->offs_aux;
+ vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
+ nsec = nsec << tk->tkr_mono.shift;
+ vdso_ts->nsec = nsec;
+ }
+
+ __arch_update_vdso_clock(vc);
+
+ vdso_write_end_clock(vc);
+
+ __arch_sync_vdso_time_data(vdata);
+}
+#endif
+
/**
* vdso_update_begin - Start of a VDSO update section
*
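
The shifted nanosecond value stored by vdso_time_update_aux() above (nsec << tkr_mono.shift) matches what a vDSO reader undoes when it consumes the per-clock data. A simplified reader-side sketch (no seqcount loop shown; the function name is illustrative, not the actual vDSO implementation), using only the vdso_clock/vdso_timestamp fields touched in this hunk:

static u64 example_vdso_read_aux_ns(const struct vdso_clock *vc, u64 cycles)
{
	const struct vdso_timestamp *ts = &vc->basetime[VDSO_BASE_AUX];
	u64 delta = (cycles - vc->cycle_last) & vc->mask;

	/* Base seconds plus the shifted base nanoseconds and the cycle delta */
	return ts->sec * NSEC_PER_SEC + ((ts->nsec + delta * vc->mult) >> vc->shift);
}
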
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a3f35c7d83b6..f135d04e1860 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -53,6 +53,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
bool
+config HAVE_EXTRA_IPI_TRACEPOINTS
+ bool
+ help
+ For architectures that use ipi_raise, ipi_entry and ipi_exit
+ tracepoints.
+
config HAVE_DYNAMIC_FTRACE_WITH_ARGS
bool
help
@@ -74,11 +80,6 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
If the architecture generates __patchable_function_entries sections
but does not want them included in the ftrace locations.
-config HAVE_FTRACE_MCOUNT_RECORD
- bool
- help
- See Documentation/trace/ftrace-design.rst
-
config HAVE_SYSCALL_TRACEPOINTS
bool
help
@@ -275,7 +276,7 @@ config FUNCTION_TRACE_ARGS
funcgraph-args (for the function graph tracer)
config DYNAMIC_FTRACE
- bool "enable/disable function tracing dynamically"
+ bool
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
default y
@@ -803,27 +804,22 @@ config BPF_KPROBE_OVERRIDE
Allows BPF to override the execution of a probed function and
set a different return value. This is used for error injection.
-config FTRACE_MCOUNT_RECORD
- def_bool y
- depends on DYNAMIC_FTRACE
- depends on HAVE_FTRACE_MCOUNT_RECORD
-
config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
bool
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
config FTRACE_MCOUNT_USE_CC
def_bool y
depends on $(cc-option,-mrecord-mcount)
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
config FTRACE_MCOUNT_USE_OBJTOOL
def_bool y
depends on HAVE_OBJTOOL_MCOUNT
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
depends on !FTRACE_MCOUNT_USE_CC
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
select OBJTOOL
config FTRACE_MCOUNT_USE_RECORDMCOUNT
@@ -831,7 +827,7 @@ config FTRACE_MCOUNT_USE_RECORDMCOUNT
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
depends on !FTRACE_MCOUNT_USE_CC
depends on !FTRACE_MCOUNT_USE_OBJTOOL
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
config TRACING_MAP
bool
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3f6a7bdc6edf..47168d2afbf1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1875,6 +1875,29 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
case REQ_OP_READ:
rwbs[i++] = 'R';
break;
+ case REQ_OP_ZONE_APPEND:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'A';
+ break;
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'R';
+ if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL)
+ rwbs[i++] = 'A';
+ break;
+ case REQ_OP_ZONE_FINISH:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'F';
+ break;
+ case REQ_OP_ZONE_OPEN:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'O';
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'C';
+ break;
default:
rwbs[i++] = 'N';
}
@@ -1890,6 +1913,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
if (opf & REQ_ATOMIC)
rwbs[i++] = 'U';
+ WARN_ON_ONCE(i >= RWBS_LEN);
+
rwbs[i] = '\0';
}
EXPORT_SYMBOL_GPL(blk_fill_rwbs);
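
For reference, the zoned-operation cases added above yield the following rwbs encodings (sketch only; modifier flags such as sync/meta may still be appended after the opcode letters):

char rwbs[RWBS_LEN];

blk_fill_rwbs(rwbs, REQ_OP_ZONE_APPEND);	/* "ZA"  */
blk_fill_rwbs(rwbs, REQ_OP_ZONE_RESET);		/* "ZR"  */
blk_fill_rwbs(rwbs, REQ_OP_ZONE_RESET_ALL);	/* "ZRA" */
blk_fill_rwbs(rwbs, REQ_OP_ZONE_FINISH);	/* "ZF"  */
blk_fill_rwbs(rwbs, REQ_OP_ZONE_OPEN);		/* "ZO"  */
blk_fill_rwbs(rwbs, REQ_OP_ZONE_CLOSE);		/* "ZC"  */
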
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 187dc37d61d4..3ae52978cae6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -572,7 +572,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
return value;
}
-static const struct bpf_func_proto bpf_perf_event_read_proto = {
+const struct bpf_func_proto bpf_perf_event_read_proto = {
.func = bpf_perf_event_read,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -781,8 +781,7 @@ BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
return (unsigned long) task_pt_regs(task);
}
-BTF_ID_LIST(bpf_task_pt_regs_ids)
-BTF_ID(struct, pt_regs)
+BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs)
const struct bpf_func_proto bpf_task_pt_regs_proto = {
.func = bpf_task_pt_regs,
@@ -882,7 +881,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig)
return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0);
}
-static const struct bpf_func_proto bpf_send_signal_proto = {
+const struct bpf_func_proto bpf_send_signal_proto = {
.func = bpf_send_signal,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -894,7 +893,7 @@ BPF_CALL_1(bpf_send_signal_thread, u32, sig)
return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0);
}
-static const struct bpf_func_proto bpf_send_signal_thread_proto = {
+const struct bpf_func_proto bpf_send_signal_thread_proto = {
.func = bpf_send_signal_thread,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1185,7 +1184,7 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
return entry_cnt * br_entry_size;
}
-static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
+const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
.func = bpf_get_branch_snapshot,
.gpl_only = true,
.ret_type = RET_INTEGER,
@@ -1270,7 +1269,7 @@ __bpf_kfunc_start_defs();
* Return: a bpf_key pointer with a valid key pointer if the key is found, a
* NULL pointer otherwise.
*/
-__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
{
key_ref_t key_ref;
struct bpf_key *bkey;
@@ -1430,56 +1429,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
const struct bpf_func_proto *func_proto;
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
- case BPF_FUNC_map_lookup_percpu_elem:
- return &bpf_map_lookup_percpu_elem_proto;
- case BPF_FUNC_ktime_get_ns:
- return &bpf_ktime_get_ns_proto;
- case BPF_FUNC_ktime_get_boot_ns:
- return &bpf_ktime_get_boot_ns_proto;
- case BPF_FUNC_tail_call:
- return &bpf_tail_call_proto;
- case BPF_FUNC_get_current_task:
- return &bpf_get_current_task_proto;
- case BPF_FUNC_get_current_task_btf:
- return &bpf_get_current_task_btf_proto;
- case BPF_FUNC_task_pt_regs:
- return &bpf_task_pt_regs_proto;
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
- case BPF_FUNC_trace_printk:
- return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
- case BPF_FUNC_get_numa_node_id:
- return &bpf_get_numa_node_id_proto;
- case BPF_FUNC_perf_event_read:
- return &bpf_perf_event_read_proto;
- case BPF_FUNC_get_prandom_u32:
- return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_probe_read_user:
- return &bpf_probe_read_user_proto;
- case BPF_FUNC_probe_read_kernel:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
- NULL : &bpf_probe_read_kernel_proto;
- case BPF_FUNC_probe_read_user_str:
- return &bpf_probe_read_user_str_proto;
- case BPF_FUNC_probe_read_kernel_str:
- return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
- NULL : &bpf_probe_read_kernel_str_proto;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
case BPF_FUNC_probe_read:
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
@@ -1488,65 +1439,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
NULL : &bpf_probe_read_compat_str_proto;
#endif
-#ifdef CONFIG_CGROUPS
- case BPF_FUNC_cgrp_storage_get:
- return &bpf_cgrp_storage_get_proto;
- case BPF_FUNC_cgrp_storage_delete:
- return &bpf_cgrp_storage_delete_proto;
- case BPF_FUNC_current_task_under_cgroup:
- return &bpf_current_task_under_cgroup_proto;
-#endif
- case BPF_FUNC_send_signal:
- return &bpf_send_signal_proto;
- case BPF_FUNC_send_signal_thread:
- return &bpf_send_signal_thread_proto;
- case BPF_FUNC_perf_event_read_value:
- return &bpf_perf_event_read_value_proto;
- case BPF_FUNC_ringbuf_output:
- return &bpf_ringbuf_output_proto;
- case BPF_FUNC_ringbuf_reserve:
- return &bpf_ringbuf_reserve_proto;
- case BPF_FUNC_ringbuf_submit:
- return &bpf_ringbuf_submit_proto;
- case BPF_FUNC_ringbuf_discard:
- return &bpf_ringbuf_discard_proto;
- case BPF_FUNC_ringbuf_query:
- return &bpf_ringbuf_query_proto;
- case BPF_FUNC_jiffies64:
- return &bpf_jiffies64_proto;
- case BPF_FUNC_get_task_stack:
- return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
- : &bpf_get_task_stack_proto;
- case BPF_FUNC_copy_from_user:
- return &bpf_copy_from_user_proto;
- case BPF_FUNC_copy_from_user_task:
- return &bpf_copy_from_user_task_proto;
- case BPF_FUNC_snprintf_btf:
- return &bpf_snprintf_btf_proto;
- case BPF_FUNC_per_cpu_ptr:
- return &bpf_per_cpu_ptr_proto;
- case BPF_FUNC_this_cpu_ptr:
- return &bpf_this_cpu_ptr_proto;
- case BPF_FUNC_task_storage_get:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_get_recur_proto;
- return &bpf_task_storage_get_proto;
- case BPF_FUNC_task_storage_delete:
- if (bpf_prog_check_recur(prog))
- return &bpf_task_storage_delete_recur_proto;
- return &bpf_task_storage_delete_proto;
- case BPF_FUNC_for_each_map_elem:
- return &bpf_for_each_map_elem_proto;
- case BPF_FUNC_snprintf:
- return &bpf_snprintf_proto;
case BPF_FUNC_get_func_ip:
return &bpf_get_func_ip_proto_tracing;
- case BPF_FUNC_get_branch_snapshot:
- return &bpf_get_branch_snapshot_proto;
- case BPF_FUNC_find_vma:
- return &bpf_find_vma_proto;
- case BPF_FUNC_trace_vprintk:
- return bpf_get_trace_vprintk_proto();
default:
break;
}
@@ -1858,7 +1752,7 @@ static struct pt_regs *get_bpf_raw_tp_regs(void)
struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
- if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+ if (nest_level > ARRAY_SIZE(tp_regs->regs)) {
this_cpu_dec(bpf_raw_tp_nest_level);
return ERR_PTR(-EBUSY);
}
@@ -2571,7 +2465,6 @@ struct bpf_kprobe_multi_link {
u32 cnt;
u32 mods_cnt;
struct module **mods;
- u32 flags;
};
struct bpf_kprobe_multi_run_ctx {
@@ -2691,7 +2584,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
info->kprobe_multi.count = kmulti_link->cnt;
- info->kprobe_multi.flags = kmulti_link->flags;
+ info->kprobe_multi.flags = kmulti_link->link.flags;
info->kprobe_multi.missed = kmulti_link->fp.nmissed;
if (!uaddrs)
@@ -2725,10 +2618,37 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
return err;
}
+#ifdef CONFIG_PROC_FS
+static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+
+ seq_printf(seq,
+ "kprobe_cnt:\t%u\n"
+ "missed:\t%lu\n",
+ kmulti_link->cnt,
+ kmulti_link->fp.nmissed);
+
+ seq_printf(seq, "%s\t %s\n", "cookie", "func");
+ for (int i = 0; i < kmulti_link->cnt; i++) {
+ seq_printf(seq,
+ "%llu\t %pS\n",
+ kmulti_link->cookies[i],
+ (void *)kmulti_link->addrs[i]);
+ }
+}
+#endif
+
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
.release = bpf_kprobe_multi_link_release,
.dealloc_deferred = bpf_kprobe_multi_link_dealloc,
.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_kprobe_multi_show_fdinfo,
+#endif
};
static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
@@ -2987,6 +2907,9 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
+ if (attr->link_create.flags)
+ return -EINVAL;
+
if (!is_kprobe_multi(prog))
return -EINVAL;
@@ -3062,7 +2985,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
}
bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
- &bpf_kprobe_multi_link_lops, prog);
+ &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type);
err = bpf_link_prime(&link->link, &link_primer);
if (err)
@@ -3078,7 +3001,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link->addrs = addrs;
link->cookies = cookies;
link->cnt = cnt;
- link->flags = flags;
+ link->link.flags = flags;
if (cookies) {
/*
@@ -3147,7 +3070,6 @@ struct bpf_uprobe_multi_link {
struct path path;
struct bpf_link link;
u32 cnt;
- u32 flags;
struct bpf_uprobe *uprobes;
struct task_struct *task;
};
@@ -3211,7 +3133,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
info->uprobe_multi.count = umulti_link->cnt;
- info->uprobe_multi.flags = umulti_link->flags;
+ info->uprobe_multi.flags = umulti_link->link.flags;
info->uprobe_multi.pid = umulti_link->task ?
task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
@@ -3256,10 +3178,54 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
return err;
}
+#ifdef CONFIG_PROC_FS
+static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_uprobe_multi_link *umulti_link;
+ char *p, *buf;
+ pid_t pid;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ return;
+
+ p = d_path(&umulti_link->path, buf, PATH_MAX);
+ if (IS_ERR(p)) {
+ kfree(buf);
+ return;
+ }
+
+ pid = umulti_link->task ?
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
+ seq_printf(seq,
+ "uprobe_cnt:\t%u\n"
+ "pid:\t%u\n"
+ "path:\t%s\n",
+ umulti_link->cnt, pid, p);
+
+ seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset");
+ for (int i = 0; i < umulti_link->cnt; i++) {
+ seq_printf(seq,
+ "%llu\t %#llx\t %#lx\n",
+ umulti_link->uprobes[i].cookie,
+ umulti_link->uprobes[i].offset,
+ umulti_link->uprobes[i].ref_ctr_offset);
+ }
+
+ kfree(buf);
+}
+#endif
+
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
.release = bpf_uprobe_multi_link_release,
.dealloc_deferred = bpf_uprobe_multi_link_dealloc,
.fill_link_info = bpf_uprobe_multi_link_fill_link_info,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_uprobe_multi_show_fdinfo,
+#endif
};
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
@@ -3376,6 +3342,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
+ if (attr->link_create.flags)
+ return -EINVAL;
+
if (!is_uprobe_multi(prog))
return -EINVAL;
@@ -3417,7 +3386,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
}
if (pid) {
+ rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_TGID);
+ rcu_read_unlock();
if (!task) {
err = -ESRCH;
goto error_path_put;
@@ -3466,10 +3437,10 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link->uprobes = uprobes;
link->path = path;
link->task = task;
- link->flags = flags;
+ link->link.flags = flags;
bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
- &bpf_uprobe_multi_link_lops, prog);
+ &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type);
for (i = 0; i < cnt; i++) {
uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
@@ -3565,6 +3536,146 @@ static int __init bpf_kprobe_multi_kfuncs_init(void)
late_initcall(bpf_kprobe_multi_kfuncs_init);
+typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk);
+
+/*
+ * The __always_inline is to make sure the compiler doesn't
+ * generate indirect calls into the callbacks, which are expensive
+ * on some kernel configurations. This allows the compiler to emit
+ * direct calls to all the specific callback implementations
+ * (copy_user_data_sleepable, copy_user_data_nofault, and so on).
+ */
+static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size,
+ const void *unsafe_src,
+ copy_fn_t str_copy_fn,
+ struct task_struct *tsk)
+{
+ struct bpf_dynptr_kern *dst;
+ u32 chunk_sz, off;
+ void *dst_slice;
+ int cnt, err;
+ char buf[256];
+
+ dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
+ if (likely(dst_slice))
+ return str_copy_fn(dst_slice, unsafe_src, size, tsk);
+
+ dst = (struct bpf_dynptr_kern *)dptr;
+ if (bpf_dynptr_check_off_len(dst, doff, size))
+ return -E2BIG;
+
+ for (off = 0; off < size; off += chunk_sz - 1) {
+ chunk_sz = min_t(u32, sizeof(buf), size - off);
+ /* Expect str_copy_fn to return the count of copied bytes, including
+ * the zero terminator. The next iteration increments off by chunk_sz - 1
+ * to overwrite the NUL.
+ */
+ cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
+ if (cnt < 0)
+ return cnt;
+ err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0);
+ if (err)
+ return err;
+ if (cnt < chunk_sz || chunk_sz == 1) /* we are done */
+ return off + cnt;
+ }
+ return off;
+}
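
The chunking scheme above is subtle: each bounce-buffer round writes a terminator that the next round must overwrite. Below is a minimal user-space sketch of the same arithmetic, using a tiny 8-byte bounce buffer so the one-byte rewind is visible; the helper names, buffer sizes, and copy semantics are illustrative, not part of the kernel API.

	#include <stdio.h>
	#include <string.h>

	/* Mimics the str_copy_fn contract: returns bytes copied including the
	 * NUL, or fills the buffer and forces a NUL at the end when truncated. */
	static long chunk_copy(char *dst, const char *src, unsigned int sz)
	{
		unsigned int i;

		for (i = 0; i < sz; i++) {
			dst[i] = src[i];
			if (!src[i])
				return i + 1;
		}
		dst[sz - 1] = '\0';
		return sz;
	}

	static long copy_str_chunked(char *dst, const char *src, unsigned int size)
	{
		char buf[8];	/* stands in for the 256-byte on-stack bounce buffer */
		unsigned int off, chunk_sz;
		long cnt;

		for (off = 0; off < size; off += chunk_sz - 1) {
			chunk_sz = size - off < sizeof(buf) ? size - off : sizeof(buf);
			cnt = chunk_copy(buf, src + off, chunk_sz);
			memcpy(dst + off, buf, cnt);
			/* A short copy means we hit the real terminator; chunk_sz == 1
			 * means the destination is full. Either way we are done. */
			if ((unsigned int)cnt < chunk_sz || chunk_sz == 1)
				return off + cnt;
		}
		return off;
	}

	int main(void)
	{
		char out[64];
		long n = copy_str_chunked(out, "hello, chunked world", sizeof(out));

		printf("%ld bytes copied (incl. NUL): %s\n", n, out);
		return 0;
	}
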
+
+static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff,
+ u32 size, const void *unsafe_src,
+ copy_fn_t copy_fn, struct task_struct *tsk)
+{
+ struct bpf_dynptr_kern *dst;
+ void *dst_slice;
+ char buf[256];
+ u32 off, chunk_sz;
+ int err;
+
+ dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
+ if (likely(dst_slice))
+ return copy_fn(dst_slice, unsafe_src, size, tsk);
+
+ dst = (struct bpf_dynptr_kern *)dptr;
+ if (bpf_dynptr_check_off_len(dst, doff, size))
+ return -E2BIG;
+
+ for (off = 0; off < size; off += chunk_sz) {
+ chunk_sz = min_t(u32, sizeof(buf), size - off);
+ err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
+ if (err)
+ return err;
+ err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static __always_inline int copy_user_data_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return copy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
+}
+
+static __always_inline int copy_user_data_sleepable(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ int ret;
+
+ if (!tsk) { /* Read from the current task */
+ ret = copy_from_user(dst, (const void __user *)unsafe_src, size);
+ if (ret)
+ return -EFAULT;
+ return 0;
+ }
+
+ ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0);
+ if (ret != size)
+ return -EFAULT;
+ return 0;
+}
+
+static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return copy_from_kernel_nofault(dst, unsafe_src, size);
+}
+
+static __always_inline int copy_user_str_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return strncpy_from_user_nofault(dst, (const void __user *)unsafe_src, size);
+}
+
+static __always_inline int copy_user_str_sleepable(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ int ret;
+
+ if (unlikely(size == 0))
+ return 0;
+
+ if (tsk) {
+ ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_src, dst, size, 0);
+ } else {
+ ret = strncpy_from_user(dst, (const void __user *)unsafe_src, size - 1);
+ /* strncpy_from_user does not guarantee NUL termination */
+ if (ret >= 0)
+ ((char *)dst)[ret] = '\0';
+ }
+
+ if (ret < 0)
+ return ret;
+ return ret + 1;
+}
+
+static __always_inline int copy_kernel_str_nofault(void *dst, const void *unsafe_src,
+ u32 size, struct task_struct *tsk)
+{
+ return strncpy_from_kernel_nofault(dst, unsafe_src, size);
+}
+
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type,
@@ -3576,4 +3687,62 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
return bpf_send_signal_common(sig, type, task, value);
}
+__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_data_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
+ copy_kernel_data_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_str_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
+ copy_kernel_str_nofault, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_data_sleepable, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_str_sleepable, NULL);
+}
+
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk)
+{
+ return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_data_sleepable, tsk);
+}
+
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off,
+ u32 size, const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk)
+{
+ return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
+ copy_user_str_sleepable, tsk);
+}
+
__bpf_kfunc_end_defs();
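
For context, a hypothetical BPF program using one of the new kfuncs could look roughly like the sketch below. It assumes libbpf with vmlinux.h, a ringbuf map, and an extern declaration matching the kernel-side signature shown above; the probe target, map name, and sizes are illustrative and not taken from this patch.

	// SPDX-License-Identifier: GPL-2.0
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	/* Extern for the new kfunc, mirroring the kernel-side signature above. */
	extern int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, __u32 off,
						  __u32 size, const void *unsafe_ptr__ign) __ksym;

	struct {
		__uint(type, BPF_MAP_TYPE_RINGBUF);
		__uint(max_entries, 4096);
	} rb SEC(".maps");

	SEC("uprobe")
	int BPF_UPROBE(log_first_arg, const char *user_str)
	{
		struct bpf_dynptr dptr;

		/* Reserve a dynptr-backed ringbuf record and copy a user string
		 * into it; the dynptr lets the copy span non-contiguous pages. */
		if (bpf_ringbuf_reserve_dynptr(&rb, 256, 0, &dptr) == 0)
			bpf_probe_read_user_str_dynptr(&dptr, 0, 256, user_str);
		/* submit must be called even if the reserve failed */
		bpf_ringbuf_submit_dynptr(&dptr, 0);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";
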
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8d925cbdce3a..f4d200f0c610 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1325,6 +1325,10 @@ int register_ftrace_graph(struct fgraph_ops *gops)
int ret = 0;
int i = -1;
+ if (WARN_ONCE(gops->ops.flags & FTRACE_OPS_FL_GRAPH,
+ "function graph ops registered again"))
+ return -EBUSY;
+
guard(mutex)(&ftrace_lock);
if (!fgraph_stack_cachep) {
@@ -1382,6 +1386,8 @@ int register_ftrace_graph(struct fgraph_ops *gops)
/* Always save the function, and reset at unregistering */
gops->saved_func = gops->entryfunc;
+ gops->ops.flags |= FTRACE_OPS_FL_GRAPH;
+
ret = ftrace_startup_subops(&graph_ops, &gops->ops, command);
if (!ret)
fgraph_array[i] = gops;
@@ -1399,17 +1405,21 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
{
int command = 0;
+ if (WARN_ONCE(!(gops->ops.flags & FTRACE_OPS_FL_GRAPH),
+ "function graph ops unregistered without registering"))
+ return;
+
guard(mutex)(&ftrace_lock);
if (unlikely(!ftrace_graph_active))
- return;
+ goto out;
if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
fgraph_array[gops->idx] != gops))
- return;
+ goto out;
if (fgraph_lru_release_index(gops->idx) < 0)
- return;
+ goto out;
fgraph_array[gops->idx] = &fgraph_stub;
@@ -1432,4 +1442,6 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
}
gops->saved_func = NULL;
+ out:
+ gops->ops.flags &= ~FTRACE_OPS_FL_GRAPH;
}
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index ba7ff14f5339..c8034dfc1070 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -352,7 +352,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
size_words = SIZE_IN_LONG(size);
ret_ip = ftrace_regs_get_instruction_pointer(fregs);
- preempt_disable();
+ preempt_disable_notrace();
curr = 0;
while (size_words > curr) {
@@ -368,7 +368,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
}
curr += size;
}
- preempt_enable();
+ preempt_enable_notrace();
}
NOKPROBE_SYMBOL(fprobe_return);
@@ -648,6 +648,11 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
#define FPROBE_IPS_MAX INT_MAX
+int fprobe_count_ips_from_filter(const char *filter, const char *notfilter)
+{
+ return get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX);
+}
+
/**
* register_fprobe() - Register fprobe to ftrace by pattern.
* @fp: A fprobe data structure to be registered.
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6981830c3128..00b76d450a89 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -188,7 +188,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
op->saved_func(ip, parent_ip, op, fregs);
}
-static void ftrace_sync_ipi(void *data)
+void ftrace_sync_ipi(void *data)
{
/* Probably not needed, but do it anyway */
smp_rmb();
@@ -1042,10 +1042,6 @@ static struct ftrace_ops *removed_ops;
*/
static bool update_all_ops;
-#ifndef CONFIG_FTRACE_MCOUNT_RECORD
-# error Dynamic ftrace depends on MCOUNT_RECORD
-#endif
-
struct ftrace_func_probe {
struct ftrace_probe_ops *probe_ops;
struct ftrace_ops ops;
@@ -4373,6 +4369,42 @@ static inline int print_rec(struct seq_file *m, unsigned long ip)
}
#endif
+static void print_subops(struct seq_file *m, struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *subops;
+ bool first = true;
+
+ list_for_each_entry(subops, &ops->subop_list, list) {
+ if (!((subops->flags & FTRACE_OPS_FL_ENABLED) &&
+ hash_contains_ip(rec->ip, subops->func_hash)))
+ continue;
+ if (first) {
+ seq_printf(m, "\tsubops:");
+ first = false;
+ }
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (subops->flags & FTRACE_OPS_FL_GRAPH) {
+ struct fgraph_ops *gops;
+
+ gops = container_of(subops, struct fgraph_ops, ops);
+ seq_printf(m, " {ent:%pS ret:%pS}",
+ (void *)gops->entryfunc,
+ (void *)gops->retfunc);
+ continue;
+ }
+#endif
+ if (subops->trampoline) {
+ seq_printf(m, " {%pS (%pS)}",
+ (void *)subops->trampoline,
+ (void *)subops->func);
+ add_trampoline_func(m, subops, rec);
+ } else {
+ seq_printf(m, " {%pS}",
+ (void *)subops->func);
+ }
+ }
+}
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -4425,6 +4457,7 @@ static int t_show(struct seq_file *m, void *v)
(void *)ops->trampoline,
(void *)ops->func);
add_trampoline_func(m, ops, rec);
+ print_subops(m, ops, rec);
ops = ftrace_find_tramp_ops_next(rec, ops);
} while (ops);
} else
@@ -4437,6 +4470,7 @@ static int t_show(struct seq_file *m, void *v)
if (ops) {
seq_printf(m, "\tops: %pS (%pS)",
ops, ops->func);
+ print_subops(m, ops, rec);
} else {
seq_puts(m, "\tops: ERROR!");
}
@@ -5170,8 +5204,12 @@ struct ftrace_func_map {
void *data;
};
+/*
+ * Note, ftrace_func_mapper is freed by free_ftrace_hash(&mapper->hash).
+ * The hash field must be the first field.
+ */
struct ftrace_func_mapper {
- struct ftrace_hash hash;
+ struct ftrace_hash hash; /* Must be first! */
};
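
The "must be first" requirement exists because the mapper is freed through a pointer to its embedded hash. A minimal user-space sketch of that idiom follows (type names are illustrative; C guarantees that the address of a struct and of its first member compare equal):

	#include <assert.h>
	#include <stdlib.h>

	struct inner { int placeholder; };

	struct outer {
		struct inner inner;	/* must stay the first member */
		char payload[32];
	};

	/* Frees the whole containing object via its first member, just as
	 * free_ftrace_hash(&mapper->hash) frees the ftrace_func_mapper. */
	static void release_inner(struct inner *in)
	{
		free(in);
	}

	int main(void)
	{
		struct outer *o = calloc(1, sizeof(*o));

		if (!o)
			return 1;
		assert((void *)o == (void *)&o->inner);
		release_inner(&o->inner);
		return 0;
	}
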
/**
@@ -5306,6 +5344,7 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
}
}
}
+ /* This also frees the mapper itself */
free_ftrace_hash(&mapper->hash);
}
@@ -7395,9 +7434,10 @@ void ftrace_release_mod(struct module *mod)
mutex_lock(&ftrace_lock);
- if (ftrace_disabled)
- goto out_unlock;
-
+ /*
+ * To avoid the UAF problem after the module is unloaded, the
+ * 'mod_map' resource needs to be released unconditionally.
+ */
list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
if (mod_map->mod == mod) {
list_del_rcu(&mod_map->list);
@@ -7406,6 +7446,9 @@ void ftrace_release_mod(struct module *mod)
}
}
+ if (ftrace_disabled)
+ goto out_unlock;
+
/*
* Each module has its own ftrace_pages, remove
* them from the list.
@@ -7584,6 +7627,9 @@ allocate_ftrace_mod_map(struct module *mod,
{
struct ftrace_mod_map *mod_map;
+ if (ftrace_disabled)
+ return NULL;
+
mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL);
if (!mod_map)
return NULL;
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index c62b9b3cfb3d..090bb5ea4a19 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -81,13 +81,9 @@ static inline bool upper_empty(union upper_chunk *chunk)
{
/*
* If chunk->data has no lower chunks, it will be the same
- * as a zeroed bitmask. Use find_first_bit() to test it
- * and if it doesn't find any bits set, then the array
- * is empty.
+ * as a zeroed bitmask.
*/
- int bit = find_first_bit((unsigned long *)chunk->data,
- sizeof(chunk->data) * 8);
- return bit >= sizeof(chunk->data) * 8;
+ return bitmap_empty((unsigned long *)chunk->data, BITS_PER_TYPE(chunk->data));
}
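
bitmap_empty() here is the canonical "is every word zero" test that the removed find_first_bit() probe was emulating. A user-space sketch of the equivalent check (the word count is whatever covers chunk->data; names are illustrative):

	#include <stdbool.h>
	#include <stddef.h>

	/* Returns true when no bit is set in the given array of words, which is
	 * what bitmap_empty() reports for the upper chunk's data. */
	static bool words_all_zero(const unsigned long *data, size_t nwords)
	{
		size_t i;

		for (i = 0; i < nwords; i++)
			if (data[i])
				return false;
		return true;
	}
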
static inline int pid_split(unsigned int pid, unsigned int *upper1,
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 21bb161c2316..f2fe33573e54 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -17,5 +17,4 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency);
-EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3f9bf562beea..5176e0270f07 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
*bpage = list_entry(p, struct buffer_page, list);
}
+static inline void rb_dec_page(struct buffer_page **bpage)
+{
+ struct list_head *p = rb_list_head((*bpage)->list.prev);
+
+ *bpage = list_entry(p, struct buffer_page, list);
+}
+
static struct buffer_page *
rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
- struct buffer_page *head_page;
+ struct buffer_page *head_page, *orig_head;
unsigned long entry_bytes = 0;
unsigned long entries = 0;
int ret;
+ u64 ts;
int i;
if (!meta || !meta->head_buffer)
@@ -1885,8 +1893,98 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
local_set(&cpu_buffer->reader_page->entries, ret);
- head_page = cpu_buffer->head_page;
+ orig_head = head_page = cpu_buffer->head_page;
+ ts = head_page->page->time_stamp;
+
+ /*
+ * Try to rewind the head so that we can read the pages which were
+ * already read in the previous boot.
+ */
+ if (head_page == cpu_buffer->tail_page)
+ goto skip_rewind;
+
+ rb_dec_page(&head_page);
+ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
+
+ /* Rewind until tail (writer) page. */
+ if (head_page == cpu_buffer->tail_page)
+ break;
+
+ /* Ensure the page has older data than head. */
+ if (ts < head_page->page->time_stamp)
+ break;
+
+ ts = head_page->page->time_stamp;
+ /* Ensure the page has correct timestamp and some data. */
+ if (!ts || rb_page_commit(head_page) == 0)
+ break;
+
+ /* Stop rewind if the page is invalid. */
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ if (ret < 0)
+ break;
+
+ /* Recover the number of entries and update stats. */
+ local_set(&head_page->entries, ret);
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+ entries += ret;
+ entry_bytes += rb_page_commit(head_page);
+ }
+ if (i)
+ pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
+
+ /* The last rewound page must be skipped. */
+ if (head_page != orig_head)
+ rb_inc_page(&head_page);
+
+ /*
+ * If the ring buffer was rewound, then inject the reader page
+ * into the location just before the original head page.
+ */
+ if (head_page != orig_head) {
+ struct buffer_page *bpage = orig_head;
+
+ rb_dec_page(&bpage);
+ /*
+ * Insert the reader_page before the original head page.
+ * Since the list pointers encode RB_PAGE flags, general list
+ * operations should be avoided.
+ */
+ cpu_buffer->reader_page->list.next = &orig_head->list;
+ cpu_buffer->reader_page->list.prev = orig_head->list.prev;
+ orig_head->list.prev = &cpu_buffer->reader_page->list;
+ bpage->list.next = &cpu_buffer->reader_page->list;
+
+ /* Make the head_page the reader page */
+ cpu_buffer->reader_page = head_page;
+ bpage = head_page;
+ rb_inc_page(&head_page);
+ head_page->list.prev = bpage->list.prev;
+ rb_dec_page(&bpage);
+ bpage->list.next = &head_page->list;
+ rb_set_list_to_head(&bpage->list);
+ cpu_buffer->pages = &head_page->list;
+
+ cpu_buffer->head_page = head_page;
+ meta->head_buffer = (unsigned long)head_page->page;
+
+ /* Reset all the indexes */
+ bpage = cpu_buffer->reader_page;
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = 0;
+
+ for (i = 1, bpage = head_page; i < meta->nr_subbufs;
+ i++, rb_inc_page(&bpage)) {
+ meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = i;
+ }
+
+ /* We'll restart verifying from orig_head */
+ head_page = orig_head;
+ }
+ skip_rewind:
/* If the commit_buffer is the reader page, update the commit page */
if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
cpu_buffer->commit_page = cpu_buffer->reader_page;
@@ -2226,7 +2324,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = NULL;
struct ring_buffer_cpu_meta *meta;
struct buffer_page *bpage;
struct page *page;
@@ -2252,7 +2350,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
if (!bpage)
- goto fail_free_buffer;
+ return NULL;
rb_check_bpage(cpu_buffer, bpage);
@@ -2318,13 +2416,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_head_page_activate(cpu_buffer);
}
- return cpu_buffer;
+ return_ptr(cpu_buffer);
fail_free_reader:
free_buffer_page(cpu_buffer->reader_page);
- fail_free_buffer:
- kfree(cpu_buffer);
return NULL;
}
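
The conversion above relies on the scope-based cleanup helpers from <linux/cleanup.h>: the __free(kfree) annotation frees the pointer automatically on every early return, and return_ptr() hands ownership to the caller. A minimal sketch of the pattern with made-up type names:

	#include <linux/cleanup.h>
	#include <linux/slab.h>

	struct widget {
		int id;
	};

	static struct widget *widget_alloc(int id)
	{
		struct widget *w __free(kfree) = kzalloc(sizeof(*w), GFP_KERNEL);

		if (!w)
			return NULL;
		if (id < 0)
			return NULL;	/* w is kfree()d automatically here */

		w->id = id;
		return_ptr(w);		/* ownership moves to the caller, no kfree */
	}
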
@@ -2359,7 +2455,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
unsigned long scratch_size,
struct lock_class_key *key)
{
- struct trace_buffer *buffer;
+ struct trace_buffer *buffer __free(kfree) = NULL;
long nr_pages;
int subbuf_size;
int bsize;
@@ -2373,7 +2469,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
return NULL;
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
- goto fail_free_buffer;
+ return NULL;
buffer->subbuf_order = order;
subbuf_size = (PAGE_SIZE << order);
@@ -2472,7 +2568,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
- return buffer;
+ return_ptr(buffer);
fail_free_buffers:
for_each_buffer_cpu(buffer, cpu) {
@@ -2484,8 +2580,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
- fail_free_buffer:
- kfree(buffer);
return NULL;
}
@@ -2849,6 +2943,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (nr_pages < 2)
nr_pages = 2;
+ /*
+ * Keep CPUs from coming online while resizing to synchronize
+ * with new per CPU buffers being created.
+ */
+ guard(cpus_read_lock)();
+
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
atomic_inc(&buffer->resizing);
@@ -2893,7 +2993,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cond_resched();
}
- cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -2933,7 +3032,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- cpus_read_unlock();
} else {
cpu_buffer = buffer->buffers[cpu_id];
@@ -2961,8 +3059,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
goto out_err;
}
- cpus_read_lock();
-
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
@@ -2981,7 +3077,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
cpu_buffer->nr_pages_to_update = 0;
- cpus_read_unlock();
}
out:
@@ -4121,7 +4216,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
static const char *show_irq_str(int bits)
{
- const char *type[] = {
+ static const char *type[] = {
".", // 0
"s", // 1
"h", // 2
@@ -4684,10 +4779,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
rb_decrement_entry(cpu_buffer, event);
- if (rb_try_to_discard(cpu_buffer, event))
- goto out;
-
- out:
+ rb_try_to_discard(cpu_buffer, event);
rb_end_commit(cpu_buffer);
trace_recursive_unlock(cpu_buffer);
@@ -4885,6 +4977,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer)
}
/**
+ * ring_buffer_record_is_on_cpu - return true if the ring buffer can write
+ * @buffer: The ring buffer to check if writing is enabled
+ * @cpu: The CPU to test if the ring buffer can write to
+ *
+ * Returns true if the ring buffer is in a state that it accepts writes
+ * for a particular CPU.
+ */
+bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ return ring_buffer_record_is_set_on(buffer) &&
+ !atomic_read(&cpu_buffer->record_disabled);
+}
+
+/**
* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
* @buffer: The ring buffer to stop writes to.
* @cpu: The CPU buffer to stop
@@ -5330,7 +5440,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->real_end = 0;
spin:
@@ -5834,24 +5943,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
EXPORT_SYMBOL_GPL(ring_buffer_consume);
/**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
* @flags: gfp flags to use for memory allocation
*
- * This performs the initial preparations necessary to iterate
- * through the buffer. Memory is allocated, buffer resizing
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
*
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -5877,51 +5982,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
atomic_inc(&cpu_buffer->resize_disabled);
- return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized. Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
- synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags;
-
- if (!iter)
- return;
-
- cpu_buffer = iter->cpu_buffer;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
arch_spin_lock(&cpu_buffer->lock);
rb_iter_reset(iter);
arch_spin_unlock(&cpu_buffer->lock);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return iter;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
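
With the prepare/sync/start trio collapsed into a single call, a caller now only pairs ring_buffer_read_start() with ring_buffer_read_finish(). A hypothetical caller sketch (the function name and the per-event processing are illustrative):

	static void dump_cpu_events(struct trace_buffer *buffer, int cpu)
	{
		struct ring_buffer_iter *iter;
		struct ring_buffer_event *event;
		u64 ts;

		iter = ring_buffer_read_start(buffer, cpu, GFP_KERNEL);
		if (!iter)
			return;

		while ((event = ring_buffer_iter_peek(iter, &ts))) {
			/* inspect the event without consuming it */
			ring_buffer_iter_advance(iter);
		}

		ring_buffer_read_finish(iter);
	}
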
@@ -6002,6 +6068,39 @@ static void rb_clear_buffer_page(struct buffer_page *page)
page->read = 0;
}
+/*
+ * When the buffer is memory mapped to user space, each sub buffer
+ * has a unique id that is used by the meta data to tell the user
+ * where the current reader page is.
+ *
+ * For a normally allocated ring buffer, the id is saved in the buffer
+ * page id field, and updated via this function.
+ *
+ * But for a fixed memory mapped buffer, the id is already assigned for
+ * fixed memory ordering in the memory layout and cannot be used. Instead,
+ * the index of where the page lies in the memory layout is used.
+ *
+ * For the normal pages, set the buffer page id with the passed in @id
+ * value and return that.
+ *
+ * For fixed memory mapped pages, get the page index in the memory layout
+ * and return that as the id.
+ */
+static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage, int id)
+{
+ /*
+ * For boot (fixed memory) buffers, the id is the subbuffer index;
+ * otherwise, store the given id in the buffer page
+ */
+ if (cpu_buffer->ring_meta)
+ id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page);
+ else
+ bpage->id = id;
+
+ return id;
+}
+
static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
@@ -6010,7 +6109,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
return;
meta->reader.read = cpu_buffer->reader_page->read;
- meta->reader.id = cpu_buffer->reader_page->id;
+ meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page,
+ cpu_buffer->reader_page->id);
+
meta->reader.lost_events = cpu_buffer->lost_events;
meta->entries = local_read(&cpu_buffer->entries);
@@ -6080,21 +6181,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
/* Must have disabled the cpu buffer then done a synchronize_rcu */
static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
- goto out;
+ return;
arch_spin_lock(&cpu_buffer->lock);
rb_reset_cpu(cpu_buffer);
arch_spin_unlock(&cpu_buffer->lock);
-
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
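
Like the other conversions in this patch, the function above swaps an explicit flags save/restore plus goto-out cleanup for guard(raw_spinlock_irqsave)() from <linux/cleanup.h>, which drops the lock and restores the interrupt state automatically when the scope is left. A minimal sketch with an invented structure:

	#include <linux/cleanup.h>
	#include <linux/spinlock.h>

	struct counter_state {
		raw_spinlock_t lock;
		bool frozen;
		unsigned long count;
	};

	static int counter_bump(struct counter_state *st)
	{
		guard(raw_spinlock_irqsave)(&st->lock);

		if (st->frozen)
			return -EBUSY;	/* lock released and irqs restored here */

		st->count++;
		return 0;		/* ...and here */
	}
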
/**
@@ -6282,37 +6378,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
!cpumask_test_cpu(cpu, buffer_b->cpumask))
- goto out;
+ return -EINVAL;
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
/* It's up to the callers to not try to swap mapped buffers */
- if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) {
- ret = -EBUSY;
- goto out;
- }
+ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped))
+ return -EBUSY;
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
- goto out;
+ return -EINVAL;
if (buffer_a->subbuf_order != buffer_b->subbuf_order)
- goto out;
-
- ret = -EAGAIN;
+ return -EINVAL;
if (atomic_read(&buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
/*
* We can't do a synchronize_rcu here because this
@@ -6349,7 +6441,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
out_dec:
atomic_dec(&cpu_buffer_a->record_disabled);
atomic_dec(&cpu_buffer_b->record_disabled);
-out:
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
@@ -6508,38 +6599,37 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
struct buffer_data_page *bpage;
struct buffer_page *reader;
unsigned long missed_events;
- unsigned long flags;
unsigned int commit;
unsigned int read;
u64 save_timestamp;
- int ret = -1;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- goto out;
+ return -1;
/*
* If len is not big enough to hold the page header, then
* we can not copy anything.
*/
if (len <= BUF_PAGE_HDR_SIZE)
- goto out;
+ return -1;
len -= BUF_PAGE_HDR_SIZE;
if (!data_page || !data_page->data)
- goto out;
+ return -1;
+
if (data_page->order != buffer->subbuf_order)
- goto out;
+ return -1;
bpage = data_page->data;
if (!bpage)
- goto out;
+ return -1;
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
- goto out_unlock;
+ return -1;
event = rb_reader_event(cpu_buffer);
@@ -6573,7 +6663,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (full &&
(!read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page))
- goto out_unlock;
+ return -1;
if (len > (commit - read))
len = (commit - read);
@@ -6582,7 +6672,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
size = rb_event_ts_length(event);
if (len < size)
- goto out_unlock;
+ return -1;
/* save the current timestamp, since the user will need it */
save_timestamp = cpu_buffer->read_stamp;
@@ -6640,7 +6730,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (reader->real_end)
local_set(&bpage->commit, reader->real_end);
}
- ret = read;
cpu_buffer->lost_events = 0;
@@ -6667,11 +6756,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (commit < buffer->subbuf_size)
memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
- out_unlock:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
- out:
- return ret;
+ return read;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
@@ -6764,7 +6849,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
old_size = buffer->subbuf_size;
/* prevent another thread from changing buffer sizes */
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
atomic_inc(&buffer->record_disabled);
/* Make sure all commits have finished */
@@ -6869,7 +6954,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
}
atomic_dec(&buffer->record_disabled);
- mutex_unlock(&buffer->mutex);
return 0;
@@ -6878,7 +6962,6 @@ error:
buffer->subbuf_size = old_size;
atomic_dec(&buffer->record_disabled);
- mutex_unlock(&buffer->mutex);
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
@@ -6926,23 +7009,29 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
struct buffer_page *first_subbuf, *subbuf;
+ int cnt = 0;
int id = 0;
- subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
- cpu_buffer->reader_page->id = id++;
+ id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
+ subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page;
+ cnt++;
first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
do {
+ id = rb_page_id(cpu_buffer, subbuf, id);
+
if (WARN_ON(id >= nr_subbufs))
break;
subbuf_ids[id] = (unsigned long)subbuf->page;
- subbuf->id = id;
rb_inc_page(&subbuf);
id++;
+ cnt++;
} while (subbuf != first_subbuf);
+ WARN_ON(cnt != nr_subbufs);
+
/* install subbuf ID to kern VA translation */
cpu_buffer->subbuf_ids = subbuf_ids;
@@ -7034,7 +7123,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
{
unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
unsigned int subbuf_pages, subbuf_order;
- struct page **pages;
+ struct page **pages __free(kfree) = NULL;
int p = 0, s = 0;
int err;
@@ -7102,10 +7191,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
struct page *page;
int off = 0;
- if (WARN_ON_ONCE(s >= nr_subbufs)) {
- err = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(s >= nr_subbufs))
+ return -EINVAL;
page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
@@ -7120,9 +7207,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
-out:
- kfree(pages);
-
return err;
}
#else
@@ -7138,36 +7222,34 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags, *subbuf_ids;
- int err = 0;
+ int err;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
- mutex_lock(&cpu_buffer->mapping_lock);
+ guard(mutex)(&cpu_buffer->mapping_lock);
if (cpu_buffer->user_mapped) {
err = __rb_map_vma(cpu_buffer, vma);
if (!err)
err = __rb_inc_dec_mapped(cpu_buffer, true);
- mutex_unlock(&cpu_buffer->mapping_lock);
return err;
}
/* prevent another thread from changing buffer/sub-buffer sizes */
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
err = rb_alloc_meta_page(cpu_buffer);
if (err)
- goto unlock;
+ return err;
/* subbuf_ids include the reader while nr_pages does not */
subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
if (!subbuf_ids) {
rb_free_meta_page(cpu_buffer);
- err = -ENOMEM;
- goto unlock;
+ return -ENOMEM;
}
atomic_inc(&cpu_buffer->resize_disabled);
@@ -7195,35 +7277,29 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
atomic_dec(&cpu_buffer->resize_disabled);
}
-unlock:
- mutex_unlock(&buffer->mutex);
- mutex_unlock(&cpu_buffer->mapping_lock);
-
- return err;
+ return 0;
}
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
- int err = 0;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
- mutex_lock(&cpu_buffer->mapping_lock);
+ guard(mutex)(&cpu_buffer->mapping_lock);
if (!cpu_buffer->user_mapped) {
- err = -ENODEV;
- goto out;
+ return -ENODEV;
} else if (cpu_buffer->user_mapped > 1) {
__rb_inc_dec_mapped(cpu_buffer, false);
- goto out;
+ return 0;
}
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/* This is the last user space mapping */
@@ -7238,12 +7314,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
rb_free_meta_page(cpu_buffer);
atomic_dec(&cpu_buffer->resize_disabled);
- mutex_unlock(&buffer->mutex);
-
-out:
- mutex_unlock(&cpu_buffer->mapping_lock);
-
- return err;
+ return 0;
}
int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
@@ -7284,8 +7355,8 @@ consume:
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
- if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
- if (missed_events) {
+ if (missed_events) {
+ if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
struct buffer_data_page *bpage = reader->page;
unsigned int commit;
/*
@@ -7306,13 +7377,23 @@ consume:
local_add(RB_MISSED_STORED, &bpage->commit);
}
local_add(RB_MISSED_EVENTS, &bpage->commit);
+ } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page,
+ "Reader on commit with %ld missed events",
+ missed_events)) {
+ /*
+ * There shouldn't be any missed events if the tail_page
+ * is on the reader page. But if the tail page is not on the
+ * reader page and the commit_page is, that would mean that
+ * there's a commit_overrun (an interrupt preempted an
+ * addition of an event and then filled the buffer
+ * with new events). In this case it's not an
+ * error, but it should still be reported.
+ *
+ * TODO: Add missed events to the page for user space to know.
+ */
+ pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n",
+ cpu, missed_events, cpu_buffer->reader_page->page->time_stamp);
}
- } else {
- /*
- * There really shouldn't be any missed events if the commit
- * is on the reader page.
- */
- WARN_ON_ONCE(missed_events);
}
cpu_buffer->lost_events = 0;
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index b39f36013ef2..5b4be87ba59d 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -1,19 +1,31 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-config DA_MON_EVENTS
+config RV_MON_EVENTS
+ bool
+
+config RV_MON_MAINTENANCE_EVENTS
bool
config DA_MON_EVENTS_IMPLICIT
- select DA_MON_EVENTS
+ select RV_MON_EVENTS
+ select RV_MON_MAINTENANCE_EVENTS
bool
config DA_MON_EVENTS_ID
- select DA_MON_EVENTS
+ select RV_MON_EVENTS
+ select RV_MON_MAINTENANCE_EVENTS
+ bool
+
+config LTL_MON_EVENTS_ID
+ select RV_MON_EVENTS
+ bool
+
+config RV_LTL_MONITOR
bool
menuconfig RV
bool "Runtime Verification"
- depends on TRACING
+ select TRACING
help
Enable the kernel runtime verification infrastructure. RV is a
lightweight (yet rigorous) method that complements classical
@@ -25,15 +37,34 @@ menuconfig RV
For further information, see:
Documentation/trace/rv/runtime-verification.rst
+config RV_PER_TASK_MONITORS
+ int "Maximum number of per-task monitor"
+ depends on RV
+ range 1 8
+ default 2
+ help
+ This option configures the maximum number of per-task RV monitors that can run
+ simultaneously.
+
source "kernel/trace/rv/monitors/wip/Kconfig"
source "kernel/trace/rv/monitors/wwnr/Kconfig"
+
source "kernel/trace/rv/monitors/sched/Kconfig"
-source "kernel/trace/rv/monitors/tss/Kconfig"
source "kernel/trace/rv/monitors/sco/Kconfig"
source "kernel/trace/rv/monitors/snroc/Kconfig"
source "kernel/trace/rv/monitors/scpd/Kconfig"
source "kernel/trace/rv/monitors/snep/Kconfig"
-source "kernel/trace/rv/monitors/sncid/Kconfig"
+source "kernel/trace/rv/monitors/sts/Kconfig"
+source "kernel/trace/rv/monitors/nrp/Kconfig"
+source "kernel/trace/rv/monitors/sssw/Kconfig"
+source "kernel/trace/rv/monitors/opid/Kconfig"
+# Add new sched monitors here
+
+source "kernel/trace/rv/monitors/rtapp/Kconfig"
+source "kernel/trace/rv/monitors/pagefault/Kconfig"
+source "kernel/trace/rv/monitors/sleep/Kconfig"
+# Add new rtapp monitors here
+
# Add new monitors here
config RV_REACTORS
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index f9b2cd0483c3..750e4ad6fa0f 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -6,12 +6,17 @@ obj-$(CONFIG_RV) += rv.o
obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o
-obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o
obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o
obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o
obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o
obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o
-obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o
+obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o
+obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o
+obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o
+obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o
+obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o
+obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o
+obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/nrp/Kconfig
index 479f86f52e60..f5ec08f65535 100644
--- a/kernel/trace/rv/monitors/tss/Kconfig
+++ b/kernel/trace/rv/monitors/nrp/Kconfig
@@ -1,14 +1,16 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-config RV_MON_TSS
+config RV_MON_NRP
depends on RV
depends on RV_MON_SCHED
- default y
- select DA_MON_EVENTS_IMPLICIT
- bool "tss monitor"
+ default y if !ARM64
+ select DA_MON_EVENTS_ID
+ bool "nrp monitor"
help
- Monitor to ensure sched_switch happens only in scheduling context.
+ Monitor to ensure preemption requires need resched.
This monitor is part of the sched monitors collection.
+ This monitor is unstable on arm64; say N unless you are testing it.
+
For further information, see:
Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c
new file mode 100644
index 000000000000..5a83b7171432
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/nrp.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "nrp"
+
+#include <trace/events/irq.h>
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "nrp.h"
+
+static struct rv_monitor rv_nrp;
+DECLARE_DA_MON_PER_TASK(nrp, unsigned char);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+
+static void handle_vector_irq_entry(void *data, int vector)
+{
+ da_handle_event_nrp(current, irq_entry_nrp);
+}
+
+static void attach_vector_irq(void)
+{
+ rv_attach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_attach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_attach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+static void detach_vector_irq(void)
+{
+ rv_detach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_detach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_detach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+#else
+/* We assume irq_entry tracepoints are sufficient on other architectures */
+static void attach_vector_irq(void) { }
+static void detach_vector_irq(void) { }
+#endif
+
+static void handle_irq_entry(void *data, int irq, struct irqaction *action)
+{
+ da_handle_event_nrp(current, irq_entry_nrp);
+}
+
+static void handle_sched_need_resched(void *data, struct task_struct *tsk,
+ int cpu, int tif)
+{
+ /*
+ * Although need_resched leads to both the rescheduling and preempt_irq
+ * states, it is safer to always start the monitor in preempt_irq,
+ * which may not mirror the system state but makes the monitor simpler.
+ */
+ if (tif == TIF_NEED_RESCHED)
+ da_handle_start_event_nrp(tsk, sched_need_resched_nrp);
+}
+
+static void handle_schedule_entry(void *data, bool preempt)
+{
+ if (preempt)
+ da_handle_event_nrp(current, schedule_entry_preempt_nrp);
+ else
+ da_handle_event_nrp(current, schedule_entry_nrp);
+}
+
+static int enable_nrp(void)
+{
+ int retval;
+
+ retval = da_monitor_init_nrp();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("nrp", irq_handler_entry, handle_irq_entry);
+ rv_attach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_attach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry);
+ attach_vector_irq();
+
+ return 0;
+}
+
+static void disable_nrp(void)
+{
+ rv_nrp.enabled = 0;
+
+ rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry);
+ rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry);
+ detach_vector_irq();
+
+ da_monitor_destroy_nrp();
+}
+
+static struct rv_monitor rv_nrp = {
+ .name = "nrp",
+ .description = "need resched preempts.",
+ .enable = enable_nrp,
+ .disable = disable_nrp,
+ .reset = da_monitor_reset_all_nrp,
+ .enabled = 0,
+};
+
+static int __init register_nrp(void)
+{
+ return rv_register_monitor(&rv_nrp, &rv_sched);
+}
+
+static void __exit unregister_nrp(void)
+{
+ rv_unregister_monitor(&rv_nrp);
+}
+
+module_init(register_nrp);
+module_exit(unregister_nrp);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("nrp: need resched preempts.");
diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h
new file mode 100644
index 000000000000..c9f12207cbf6
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/nrp.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of nrp automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_nrp {
+ preempt_irq_nrp = 0,
+ any_thread_running_nrp,
+ nested_preempt_nrp,
+ rescheduling_nrp,
+ state_max_nrp
+};
+
+#define INVALID_STATE state_max_nrp
+
+enum events_nrp {
+ irq_entry_nrp = 0,
+ sched_need_resched_nrp,
+ schedule_entry_nrp,
+ schedule_entry_preempt_nrp,
+ event_max_nrp
+};
+
+struct automaton_nrp {
+ char *state_names[state_max_nrp];
+ char *event_names[event_max_nrp];
+ unsigned char function[state_max_nrp][event_max_nrp];
+ unsigned char initial_state;
+ bool final_states[state_max_nrp];
+};
+
+static const struct automaton_nrp automaton_nrp = {
+ .state_names = {
+ "preempt_irq",
+ "any_thread_running",
+ "nested_preempt",
+ "rescheduling"
+ },
+ .event_names = {
+ "irq_entry",
+ "sched_need_resched",
+ "schedule_entry",
+ "schedule_entry_preempt"
+ },
+ .function = {
+ {
+ preempt_irq_nrp,
+ preempt_irq_nrp,
+ nested_preempt_nrp,
+ nested_preempt_nrp
+ },
+ {
+ any_thread_running_nrp,
+ rescheduling_nrp,
+ any_thread_running_nrp,
+ INVALID_STATE
+ },
+ {
+ nested_preempt_nrp,
+ preempt_irq_nrp,
+ any_thread_running_nrp,
+ any_thread_running_nrp
+ },
+ {
+ preempt_irq_nrp,
+ rescheduling_nrp,
+ any_thread_running_nrp,
+ any_thread_running_nrp
+ },
+ },
+ .initial_state = preempt_irq_nrp,
+ .final_states = { 0, 1, 0, 0 },
+};
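
The generated table is consumed by the DA monitor infrastructure, but the lookup itself is just an indexed read. The following is a hypothetical stepping helper, only to illustrate how the function matrix and INVALID_STATE are meant to be used (the real stepping code is generated by the da_monitor macros):

	static inline bool nrp_try_step(unsigned char *state, enum events_nrp event)
	{
		unsigned char next = automaton_nrp.function[*state][event];

		if (next == INVALID_STATE)
			return false;	/* no transition: the property was violated */

		*state = next;		/* take the transition */
		return true;
	}
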
diff --git a/kernel/trace/rv/monitors/nrp/nrp_trace.h b/kernel/trace/rv/monitors/nrp/nrp_trace.h
new file mode 100644
index 000000000000..2e13497de3b6
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/nrp_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_NRP
+DEFINE_EVENT(event_da_monitor_id, event_nrp,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_nrp,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_NRP */
diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig
new file mode 100644
index 000000000000..561d32da572b
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/Kconfig
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_OPID
+ depends on RV
+ depends on TRACE_IRQFLAGS
+ depends on TRACE_PREEMPT_TOGGLE
+ depends on RV_MON_SCHED
+ default y if PREEMPT_RT
+ select DA_MON_EVENTS_IMPLICIT
+ bool "opid monitor"
+ help
+ Monitor to ensure operations like wakeup and need resched occur with
+ interrupts and preemption disabled or during IRQs, where preemption
+ may not be disabled explicitly.
+
+ This monitor is unstable on !PREEMPT_RT; say N unless you are testing it.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
new file mode 100644
index 000000000000..50d64e7fb8c4
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "opid"
+
+#include <trace/events/sched.h>
+#include <trace/events/irq.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "opid.h"
+
+static struct rv_monitor rv_opid;
+DECLARE_DA_MON_PER_CPU(opid, unsigned char);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+
+static void handle_vector_irq_entry(void *data, int vector)
+{
+ da_handle_event_opid(irq_entry_opid);
+}
+
+static void attach_vector_irq(void)
+{
+ rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+static void detach_vector_irq(void)
+{
+ rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+#else
+/* We assume irq_entry tracepoints are sufficient on other architectures */
+static void attach_vector_irq(void) { }
+static void detach_vector_irq(void) { }
+#endif
+
+static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_opid(irq_disable_opid);
+}
+
+static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_opid(irq_enable_opid);
+}
+
+static void handle_irq_entry(void *data, int irq, struct irqaction *action)
+{
+ da_handle_event_opid(irq_entry_opid);
+}
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_opid(preempt_disable_opid);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_opid(preempt_enable_opid);
+}
+
+static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif)
+{
+ /* The monitor's initial state is not in_irq */
+ if (this_cpu_read(hardirq_context))
+ da_handle_event_opid(sched_need_resched_opid);
+ else
+ da_handle_start_event_opid(sched_need_resched_opid);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *p)
+{
+ /* The monitor's initial state is not in_irq */
+ if (this_cpu_read(hardirq_context))
+ da_handle_event_opid(sched_waking_opid);
+ else
+ da_handle_start_event_opid(sched_waking_opid);
+}
+
+static int enable_opid(void)
+{
+ int retval;
+
+ retval = da_monitor_init_opid();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("opid", irq_disable, handle_irq_disable);
+ rv_attach_trace_probe("opid", irq_enable, handle_irq_enable);
+ rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry);
+ rv_attach_trace_probe("opid", preempt_disable, handle_preempt_disable);
+ rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_attach_trace_probe("opid", sched_waking, handle_sched_waking);
+ attach_vector_irq();
+
+ return 0;
+}
+
+static void disable_opid(void)
+{
+ rv_opid.enabled = 0;
+
+ rv_detach_trace_probe("opid", irq_disable, handle_irq_disable);
+ rv_detach_trace_probe("opid", irq_enable, handle_irq_enable);
+ rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry);
+ rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
+ detach_vector_irq();
+
+ da_monitor_destroy_opid();
+}
+
+/*
+ * This is the monitor register section.
+ */
+static struct rv_monitor rv_opid = {
+ .name = "opid",
+ .description = "operations with preemption and irq disabled.",
+ .enable = enable_opid,
+ .disable = disable_opid,
+ .reset = da_monitor_reset_all_opid,
+ .enabled = 0,
+};
+
+static int __init register_opid(void)
+{
+ return rv_register_monitor(&rv_opid, &rv_sched);
+}
+
+static void __exit unregister_opid(void)
+{
+ rv_unregister_monitor(&rv_opid);
+}
+
+module_init(register_opid);
+module_exit(unregister_opid);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("opid: operations with preemption and irq disabled.");
diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h
new file mode 100644
index 000000000000..b4b8c2ff7f64
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/opid.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of opid automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_opid {
+ disabled_opid = 0,
+ enabled_opid,
+ in_irq_opid,
+ irq_disabled_opid,
+ preempt_disabled_opid,
+ state_max_opid
+};
+
+#define INVALID_STATE state_max_opid
+
+enum events_opid {
+ irq_disable_opid = 0,
+ irq_enable_opid,
+ irq_entry_opid,
+ preempt_disable_opid,
+ preempt_enable_opid,
+ sched_need_resched_opid,
+ sched_waking_opid,
+ event_max_opid
+};
+
+struct automaton_opid {
+ char *state_names[state_max_opid];
+ char *event_names[event_max_opid];
+ unsigned char function[state_max_opid][event_max_opid];
+ unsigned char initial_state;
+ bool final_states[state_max_opid];
+};
+
+static const struct automaton_opid automaton_opid = {
+ .state_names = {
+ "disabled",
+ "enabled",
+ "in_irq",
+ "irq_disabled",
+ "preempt_disabled"
+ },
+ .event_names = {
+ "irq_disable",
+ "irq_enable",
+ "irq_entry",
+ "preempt_disable",
+ "preempt_enable",
+ "sched_need_resched",
+ "sched_waking"
+ },
+ .function = {
+ {
+ INVALID_STATE,
+ preempt_disabled_opid,
+ disabled_opid,
+ INVALID_STATE,
+ irq_disabled_opid,
+ disabled_opid,
+ disabled_opid
+ },
+ {
+ irq_disabled_opid,
+ INVALID_STATE,
+ INVALID_STATE,
+ preempt_disabled_opid,
+ enabled_opid,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ enabled_opid,
+ in_irq_opid,
+ INVALID_STATE,
+ INVALID_STATE,
+ in_irq_opid,
+ in_irq_opid
+ },
+ {
+ INVALID_STATE,
+ enabled_opid,
+ in_irq_opid,
+ disabled_opid,
+ INVALID_STATE,
+ irq_disabled_opid,
+ INVALID_STATE
+ },
+ {
+ disabled_opid,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ enabled_opid,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ },
+ .initial_state = disabled_opid,
+ .final_states = { 0, 1, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h
index 3ce42a57671d..3df6ff955c30 100644
--- a/kernel/trace/rv/monitors/sncid/sncid_trace.h
+++ b/kernel/trace/rv/monitors/opid/opid_trace.h
@@ -4,12 +4,12 @@
* Snippet to be included in rv_trace.h
*/
-#ifdef CONFIG_RV_MON_SNCID
-DEFINE_EVENT(event_da_monitor, event_sncid,
+#ifdef CONFIG_RV_MON_OPID
+DEFINE_EVENT(event_da_monitor, event_opid,
TP_PROTO(char *state, char *event, char *next_state, bool final_state),
TP_ARGS(state, event, next_state, final_state));
-DEFINE_EVENT(error_da_monitor, error_sncid,
+DEFINE_EVENT(error_da_monitor, error_opid,
TP_PROTO(char *state, char *event),
TP_ARGS(state, event));
-#endif /* CONFIG_RV_MON_SNCID */
+#endif /* CONFIG_RV_MON_OPID */
diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig
new file mode 100644
index 000000000000..5e16625f1653
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/Kconfig
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_PAGEFAULT
+ depends on RV
+ select RV_LTL_MONITOR
+ depends on RV_MON_RTAPP
+ depends on X86 || RISCV
+ default y
+ select LTL_MON_EVENTS_ID
+ bool "pagefault monitor"
+ help
+ Monitor that real-time tasks do not raise page faults, causing
+ undesirable latency.
+
+	  If you are developing a real-time system and are not entirely sure
+	  whether the applications are designed correctly for real-time, you
+	  want to say Y here.
+
+	  This monitor does not affect execution speed while it is not running;
+	  therefore, it is safe to enable in a production kernel.
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.c b/kernel/trace/rv/monitors/pagefault/pagefault.c
new file mode 100644
index 000000000000..9fe6123b2200
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/pagefault.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rv.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/rt.h>
+#include <linux/tracepoint.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "pagefault"
+
+#include <rv_trace.h>
+#include <trace/events/exceptions.h>
+#include <monitors/rtapp/rtapp.h>
+
+#include "pagefault.h"
+#include <rv/ltl_monitor.h>
+
+static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon)
+{
+ /*
+ * This includes "actual" real-time tasks and also PI-boosted
+	 * tasks. A task being PI-boosted means it is blocking an "actual"
+	 * real-time task, therefore it should also obey the monitor's rule,
+	 * otherwise the "actual" real-time task may be delayed.
+ */
+ ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task));
+}
+
+static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation)
+{
+ if (task_creation)
+ ltl_atom_set(mon, LTL_PAGEFAULT, false);
+}
+
+static void handle_page_fault(void *data, unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ ltl_atom_pulse(current, LTL_PAGEFAULT, true);
+}
+
+static int enable_pagefault(void)
+{
+ int retval;
+
+ retval = ltl_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault);
+ rv_attach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault);
+
+ return 0;
+}
+
+static void disable_pagefault(void)
+{
+ rv_detach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault);
+ rv_detach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault);
+
+ ltl_monitor_destroy();
+}
+
+static struct rv_monitor rv_pagefault = {
+ .name = "pagefault",
+ .description = "Monitor that RT tasks do not raise page faults",
+ .enable = enable_pagefault,
+ .disable = disable_pagefault,
+};
+
+static int __init register_pagefault(void)
+{
+ return rv_register_monitor(&rv_pagefault, &rv_rtapp);
+}
+
+static void __exit unregister_pagefault(void)
+{
+ rv_unregister_monitor(&rv_pagefault);
+}
+
+module_init(register_pagefault);
+module_exit(unregister_pagefault);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("pagefault: Monitor that RT tasks do not raise page faults");
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.h b/kernel/trace/rv/monitors/pagefault/pagefault.h
new file mode 100644
index 000000000000..c580ec194009
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/pagefault.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * C implementation of Buchi automaton, automatically generated by
+ * tools/verification/rvgen from the linear temporal logic specification.
+ * For further information, see kernel documentation:
+ * Documentation/trace/rv/linear_temporal_logic.rst
+ */
+
+#include <linux/rv.h>
+
+#define MONITOR_NAME pagefault
+
+enum ltl_atom {
+ LTL_PAGEFAULT,
+ LTL_RT,
+ LTL_NUM_ATOM
+};
+static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM);
+
+static const char *ltl_atom_str(enum ltl_atom atom)
+{
+ static const char *const names[] = {
+ "pa",
+ "rt",
+ };
+
+ return names[atom];
+}
+
+enum ltl_buchi_state {
+ S0,
+ RV_NUM_BA_STATES
+};
+static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
+
+static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
+{
+ bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms);
+ bool val3 = !pagefault;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val4 = val1 || val3;
+
+ if (val4)
+ __set_bit(S0, mon->states);
+}
+
+static void
+ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
+{
+ bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms);
+ bool val3 = !pagefault;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val4 = val1 || val3;
+
+ switch (state) {
+ case S0:
+ if (val4)
+ __set_bit(S0, next);
+ break;
+ }
+}
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault_trace.h b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h
new file mode 100644
index 000000000000..fe1f82597b1a
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_PAGEFAULT
+DEFINE_EVENT(event_ltl_monitor_id, event_pagefault,
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+ TP_ARGS(task, states, atoms, next));
+DEFINE_EVENT(error_ltl_monitor_id, error_pagefault,
+ TP_PROTO(struct task_struct *task),
+ TP_ARGS(task));
+#endif /* CONFIG_RV_MON_PAGEFAULT */
diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig
new file mode 100644
index 000000000000..1ce9370a9ba8
--- /dev/null
+++ b/kernel/trace/rv/monitors/rtapp/Kconfig
@@ -0,0 +1,11 @@
+config RV_MON_RTAPP
+ depends on RV
+ depends on RV_PER_TASK_MONITORS >= 2
+ bool "rtapp monitor"
+ help
+	  Collection of monitors to check for common problems with real-time
+	  applications that may cause unexpected latency.
+
+	  If you are developing a real-time system and are not entirely sure
+	  whether the applications are designed correctly for real-time, you
+	  want to say Y here.
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c
new file mode 100644
index 000000000000..fd75fc927d65
--- /dev/null
+++ b/kernel/trace/rv/monitors/rtapp/rtapp.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+#define MODULE_NAME "rtapp"
+
+#include "rtapp.h"
+
+struct rv_monitor rv_rtapp;
+
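+/*
+ * rtapp defines no enable()/disable() callbacks: it acts purely as a
+ * container grouping the real-time application monitors nested under it.
+ */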
+struct rv_monitor rv_rtapp = {
+ .name = "rtapp",
+ .description = "Collection of monitors for detecting problems with real-time applications",
+};
+
+static int __init register_rtapp(void)
+{
+ return rv_register_monitor(&rv_rtapp, NULL);
+}
+
+static void __exit unregister_rtapp(void)
+{
+ rv_unregister_monitor(&rv_rtapp);
+}
+
+module_init(register_rtapp);
+module_exit(unregister_rtapp);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("Collection of monitors for detecting problems with real-time applications");
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.h b/kernel/trace/rv/monitors/rtapp/rtapp.h
new file mode 100644
index 000000000000..4c200d67c7f6
--- /dev/null
+++ b/kernel/trace/rv/monitors/rtapp/rtapp.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+extern struct rv_monitor rv_rtapp;
diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig
index ae3eb410abd7..aa16456da864 100644
--- a/kernel/trace/rv/monitors/sched/Kconfig
+++ b/kernel/trace/rv/monitors/sched/Kconfig
@@ -2,6 +2,7 @@
#
config RV_MON_SCHED
depends on RV
+ depends on RV_PER_TASK_MONITORS >= 3
bool "sched monitor"
help
Collection of monitors to check the scheduler behaves according to specifications.
diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c
index 905e03c3c934..d04db4b543f9 100644
--- a/kernel/trace/rv/monitors/sched/sched.c
+++ b/kernel/trace/rv/monitors/sched/sched.c
@@ -21,8 +21,7 @@ struct rv_monitor rv_sched = {
static int __init register_sched(void)
{
- rv_register_monitor(&rv_sched, NULL);
- return 0;
+ return rv_register_monitor(&rv_sched, NULL);
}
static void __exit unregister_sched(void)
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c
index 4cff59220bfc..04c36405e2e3 100644
--- a/kernel/trace/rv/monitors/sco/sco.c
+++ b/kernel/trace/rv/monitors/sco/sco.c
@@ -24,12 +24,12 @@ static void handle_sched_set_state(void *data, struct task_struct *tsk, int stat
da_handle_start_event_sco(sched_set_state_sco);
}
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
{
da_handle_event_sco(schedule_entry_sco);
}
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
{
da_handle_start_event_sco(schedule_exit_sco);
}
@@ -71,8 +71,7 @@ static struct rv_monitor rv_sco = {
static int __init register_sco(void)
{
- rv_register_monitor(&rv_sco, &rv_sched);
- return 0;
+ return rv_register_monitor(&rv_sco, &rv_sched);
}
static void __exit unregister_sco(void)
diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig
index b9114fbf680f..682d0416188b 100644
--- a/kernel/trace/rv/monitors/scpd/Kconfig
+++ b/kernel/trace/rv/monitors/scpd/Kconfig
@@ -2,7 +2,7 @@
#
config RV_MON_SCPD
depends on RV
- depends on PREEMPT_TRACER
+ depends on TRACE_PREEMPT_TOGGLE
depends on RV_MON_SCHED
default y
select DA_MON_EVENTS_IMPLICIT
diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c
index cbdd6a5f8d7f..1e351ba52fee 100644
--- a/kernel/trace/rv/monitors/scpd/scpd.c
+++ b/kernel/trace/rv/monitors/scpd/scpd.c
@@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa
da_handle_start_event_scpd(preempt_enable_scpd);
}
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
{
da_handle_event_scpd(schedule_entry_scpd);
}
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
{
da_handle_event_scpd(schedule_exit_scpd);
}
@@ -79,8 +79,7 @@ static struct rv_monitor rv_scpd = {
static int __init register_scpd(void)
{
- rv_register_monitor(&rv_scpd, &rv_sched);
- return 0;
+ return rv_register_monitor(&rv_scpd, &rv_sched);
}
static void __exit unregister_scpd(void)
diff --git a/kernel/trace/rv/monitors/sleep/Kconfig b/kernel/trace/rv/monitors/sleep/Kconfig
new file mode 100644
index 000000000000..6b7a122e7b47
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/Kconfig
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SLEEP
+ depends on RV
+ select RV_LTL_MONITOR
+ depends on HAVE_SYSCALL_TRACEPOINTS
+ depends on RV_MON_RTAPP
+ select TRACE_IRQFLAGS
+ default y
+ select LTL_MON_EVENTS_ID
+ bool "sleep monitor"
+ help
+ Monitor that real-time tasks do not sleep in a manner that may
+ cause undesirable latency.
+
+	  If you are developing a real-time system and are not entirely sure
+	  whether the applications are designed correctly for real-time, you
+	  want to say Y here.
+
+	  Enabling this monitor may have a performance impact (because it
+	  selects TRACE_IRQFLAGS). Therefore, you should probably say N for a
+	  production kernel.
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
new file mode 100644
index 000000000000..eea447b06907
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rv.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/rt.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "sleep"
+
+#include <trace/events/syscalls.h>
+#include <trace/events/sched.h>
+#include <trace/events/lock.h>
+#include <uapi/linux/futex.h>
+#include <rv_trace.h>
+#include <monitors/rtapp/rtapp.h>
+
+#include "sleep.h"
+#include <rv/ltl_monitor.h>
+
+static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon)
+{
+ /*
+ * This includes "actual" real-time tasks and also PI-boosted
+	 * tasks. A task being PI-boosted means it is blocking an "actual"
+	 * real-time task, therefore it should also obey the monitor's rule,
+	 * otherwise the "actual" real-time task may be delayed.
+ */
+ ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task));
+}
+
+static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation)
+{
+ ltl_atom_set(mon, LTL_SLEEP, false);
+ ltl_atom_set(mon, LTL_WAKE, false);
+ ltl_atom_set(mon, LTL_ABORT_SLEEP, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_HARDIRQ, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_NMI, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false);
+
+ if (task_creation) {
+ ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
+ ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+ ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
+ }
+
+ if (task->flags & PF_KTHREAD) {
+ ltl_atom_set(mon, LTL_KERNEL_THREAD, true);
+
+		/* kernel threads do not make syscalls */
+ ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
+
+ if (strstarts(task->comm, "migration/"))
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true);
+ else
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false);
+
+ if (strstarts(task->comm, "rcu"))
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, true);
+ else
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, false);
+ } else {
+ ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
+ ltl_atom_set(mon, LTL_KERNEL_THREAD, false);
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, false);
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false);
+	}
+}
+
+static void handle_sched_set_state(void *data, struct task_struct *task, int state)
+{
+ if (state & TASK_INTERRUPTIBLE)
+ ltl_atom_pulse(task, LTL_SLEEP, true);
+ else if (state == TASK_RUNNING)
+ ltl_atom_pulse(task, LTL_ABORT_SLEEP, true);
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *task)
+{
+ ltl_atom_pulse(task, LTL_WAKE, true);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *task)
+{
+ if (this_cpu_read(hardirq_context)) {
+ ltl_atom_pulse(task, LTL_WOKEN_BY_HARDIRQ, true);
+ } else if (in_task()) {
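+		/*
+		 * A lower ->prio value means a higher priority: the waker runs
+		 * at a priority equal to or higher than the woken task.
+		 */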
+ if (current->prio <= task->prio)
+ ltl_atom_pulse(task, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, true);
+ } else if (in_nmi()) {
+ ltl_atom_pulse(task, LTL_WOKEN_BY_NMI, true);
+ }
+}
+
+static void handle_contention_begin(void *data, void *lock, unsigned int flags)
+{
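+	/* LCB_F_RT indicates contention on an rt_mutex-based sleeping lock. */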
+ if (flags & LCB_F_RT)
+ ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true);
+}
+
+static void handle_contention_end(void *data, void *lock, int ret)
+{
+ ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, false);
+}
+
+static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
+{
+ struct ltl_monitor *mon;
+ unsigned long args[6];
+ int op, cmd;
+
+ mon = ltl_get_monitor(current);
+
+ switch (id) {
+ case __NR_clock_nanosleep:
+#ifdef __NR_clock_nanosleep_time64
+ case __NR_clock_nanosleep_time64:
+#endif
+ syscall_get_arguments(current, regs, args);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] == CLOCK_MONOTONIC);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] == CLOCK_TAI);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] == TIMER_ABSTIME);
+ ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true);
+ break;
+
+ case __NR_futex:
+#ifdef __NR_futex_time64
+ case __NR_futex_time64:
+#endif
+ syscall_get_arguments(current, regs, args);
+ op = args[1];
+ cmd = op & FUTEX_CMD_MASK;
+
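+		/*
+		 * FUTEX_CMD_MASK strips modifier bits such as FUTEX_PRIVATE_FLAG
+		 * and FUTEX_CLOCK_REALTIME, leaving only the base command.
+		 */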
+ switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
+ ltl_atom_update(current, LTL_FUTEX_LOCK_PI, true);
+ break;
+ case FUTEX_WAIT:
+ case FUTEX_WAIT_BITSET:
+ case FUTEX_WAIT_REQUEUE_PI:
+ ltl_atom_update(current, LTL_FUTEX_WAIT, true);
+ break;
+ }
+ break;
+ }
+}
+
+static void handle_sys_exit(void *data, struct pt_regs *regs, long ret)
+{
+ struct ltl_monitor *mon = ltl_get_monitor(current);
+
+ ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+ ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
+}
+
+static void handle_kthread_stop(void *data, struct task_struct *task)
+{
+ /* FIXME: this could race with other tracepoint handlers */
+ ltl_atom_update(task, LTL_KTHREAD_SHOULD_STOP, true);
+}
+
+static int enable_sleep(void)
+{
+ int retval;
+
+ retval = ltl_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
+ rv_attach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup);
+ rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin);
+ rv_attach_trace_probe("rtapp_sleep", contention_end, handle_contention_end);
+ rv_attach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop);
+ rv_attach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter);
+ rv_attach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit);
+ return 0;
+}
+
+static void disable_sleep(void)
+{
+ rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
+ rv_detach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup);
+ rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin);
+ rv_detach_trace_probe("rtapp_sleep", contention_end, handle_contention_end);
+ rv_detach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop);
+ rv_detach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter);
+ rv_detach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit);
+
+ ltl_monitor_destroy();
+}
+
+static struct rv_monitor rv_sleep = {
+ .name = "sleep",
+ .description = "Monitor that RT tasks do not undesirably sleep",
+ .enable = enable_sleep,
+ .disable = disable_sleep,
+};
+
+static int __init register_sleep(void)
+{
+ return rv_register_monitor(&rv_sleep, &rv_rtapp);
+}
+
+static void __exit unregister_sleep(void)
+{
+ rv_unregister_monitor(&rv_sleep);
+}
+
+module_init(register_sleep);
+module_exit(unregister_sleep);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep");
diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h
new file mode 100644
index 000000000000..2ab46fd218d2
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep.h
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * C implementation of Buchi automaton, automatically generated by
+ * tools/verification/rvgen from the linear temporal logic specification.
+ * For further information, see kernel documentation:
+ * Documentation/trace/rv/linear_temporal_logic.rst
+ */
+
+#include <linux/rv.h>
+
+#define MONITOR_NAME sleep
+
+enum ltl_atom {
+ LTL_ABORT_SLEEP,
+ LTL_BLOCK_ON_RT_MUTEX,
+ LTL_CLOCK_NANOSLEEP,
+ LTL_FUTEX_LOCK_PI,
+ LTL_FUTEX_WAIT,
+ LTL_KERNEL_THREAD,
+ LTL_KTHREAD_SHOULD_STOP,
+ LTL_NANOSLEEP_CLOCK_MONOTONIC,
+ LTL_NANOSLEEP_CLOCK_TAI,
+ LTL_NANOSLEEP_TIMER_ABSTIME,
+ LTL_RT,
+ LTL_SLEEP,
+ LTL_TASK_IS_MIGRATION,
+ LTL_TASK_IS_RCU,
+ LTL_WAKE,
+ LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ LTL_WOKEN_BY_HARDIRQ,
+ LTL_WOKEN_BY_NMI,
+ LTL_NUM_ATOM
+};
+static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM);
+
+static const char *ltl_atom_str(enum ltl_atom atom)
+{
+ static const char *const names[] = {
+ "ab_sl",
+ "bl_on_rt_mu",
+ "cl_na",
+ "fu_lo_pi",
+ "fu_wa",
+ "ker_th",
+ "kth_sh_st",
+ "na_cl_mo",
+ "na_cl_ta",
+ "na_ti_ab",
+ "rt",
+ "sl",
+ "ta_mi",
+ "ta_rc",
+ "wak",
+ "wo_eq_hi_pr",
+ "wo_ha",
+ "wo_nm",
+ };
+
+ return names[atom];
+}
+
+enum ltl_buchi_state {
+ S0,
+ S1,
+ S2,
+ S3,
+ S4,
+ S5,
+ S6,
+ S7,
+ RV_NUM_BA_STATES
+};
+static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
+
+static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
+{
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool val40 = task_is_rcu || task_is_migration;
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool val41 = futex_lock_pi || val40;
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool val5 = block_on_rt_mutex || val41;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val32 = abort_sleep || kthread_should_stop;
+ bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
+ bool val33 = woken_by_nmi || val32;
+ bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
+ bool val34 = woken_by_hardirq || val33;
+ bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ mon->atoms);
+ bool val14 = woken_by_equal_or_higher_prio || val34;
+ bool wake = test_bit(LTL_WAKE, mon->atoms);
+ bool val13 = !wake;
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
+ bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
+ bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
+ bool val25 = nanosleep_timer_abstime && val24;
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool val18 = clock_nanosleep && val25;
+ bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
+ bool val9 = futex_wait || val18;
+ bool val11 = val9 || kernel_thread;
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool val2 = !sleep;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val3 = val1 || val2;
+
+ if (val3)
+ __set_bit(S0, mon->states);
+ if (val11 && val13)
+ __set_bit(S1, mon->states);
+ if (val11 && val14)
+ __set_bit(S4, mon->states);
+ if (val5)
+ __set_bit(S5, mon->states);
+}
+
+static void
+ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
+{
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool val40 = task_is_rcu || task_is_migration;
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool val41 = futex_lock_pi || val40;
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool val5 = block_on_rt_mutex || val41;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val32 = abort_sleep || kthread_should_stop;
+ bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
+ bool val33 = woken_by_nmi || val32;
+ bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
+ bool val34 = woken_by_hardirq || val33;
+ bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ mon->atoms);
+ bool val14 = woken_by_equal_or_higher_prio || val34;
+ bool wake = test_bit(LTL_WAKE, mon->atoms);
+ bool val13 = !wake;
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
+ bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
+ bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
+ bool val25 = nanosleep_timer_abstime && val24;
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool val18 = clock_nanosleep && val25;
+ bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
+ bool val9 = futex_wait || val18;
+ bool val11 = val9 || kernel_thread;
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool val2 = !sleep;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val3 = val1 || val2;
+
+ switch (state) {
+ case S0:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S1:
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val13 && val3)
+ __set_bit(S2, next);
+ if (val14 && val3)
+ __set_bit(S3, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val13 && val5)
+ __set_bit(S6, next);
+ if (val14 && val5)
+ __set_bit(S7, next);
+ break;
+ case S2:
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val13 && val3)
+ __set_bit(S2, next);
+ if (val14 && val3)
+ __set_bit(S3, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val13 && val5)
+ __set_bit(S6, next);
+ if (val14 && val5)
+ __set_bit(S7, next);
+ break;
+ case S3:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S4:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S5:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S6:
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val13 && val3)
+ __set_bit(S2, next);
+ if (val14 && val3)
+ __set_bit(S3, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val13 && val5)
+ __set_bit(S6, next);
+ if (val14 && val5)
+ __set_bit(S7, next);
+ break;
+ case S7:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ }
+}
diff --git a/kernel/trace/rv/monitors/sleep/sleep_trace.h b/kernel/trace/rv/monitors/sleep/sleep_trace.h
new file mode 100644
index 000000000000..22eaf31da987
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep_trace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SLEEP
+DEFINE_EVENT(event_ltl_monitor_id, event_sleep,
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+ TP_ARGS(task, states, atoms, next));
+DEFINE_EVENT(error_ltl_monitor_id, error_sleep,
+ TP_PROTO(struct task_struct *task),
+ TP_ARGS(task));
+#endif /* CONFIG_RV_MON_SLEEP */
diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c
deleted file mode 100644
index f5037cd6214c..000000000000
--- a/kernel/trace/rv/monitors/sncid/sncid.c
+++ /dev/null
@@ -1,96 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/ftrace.h>
-#include <linux/tracepoint.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/rv.h>
-#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
-
-#define MODULE_NAME "sncid"
-
-#include <trace/events/sched.h>
-#include <trace/events/preemptirq.h>
-#include <rv_trace.h>
-#include <monitors/sched/sched.h>
-
-#include "sncid.h"
-
-static struct rv_monitor rv_sncid;
-DECLARE_DA_MON_PER_CPU(sncid, unsigned char);
-
-static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_event_sncid(irq_disable_sncid);
-}
-
-static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_start_event_sncid(irq_enable_sncid);
-}
-
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
-{
- da_handle_start_event_sncid(schedule_entry_sncid);
-}
-
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
-{
- da_handle_start_event_sncid(schedule_exit_sncid);
-}
-
-static int enable_sncid(void)
-{
- int retval;
-
- retval = da_monitor_init_sncid();
- if (retval)
- return retval;
-
- rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable);
- rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable);
- rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry);
- rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit);
-
- return 0;
-}
-
-static void disable_sncid(void)
-{
- rv_sncid.enabled = 0;
-
- rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable);
- rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable);
- rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry);
- rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit);
-
- da_monitor_destroy_sncid();
-}
-
-static struct rv_monitor rv_sncid = {
- .name = "sncid",
- .description = "schedule not called with interrupt disabled.",
- .enable = enable_sncid,
- .disable = disable_sncid,
- .reset = da_monitor_reset_all_sncid,
- .enabled = 0,
-};
-
-static int __init register_sncid(void)
-{
- rv_register_monitor(&rv_sncid, &rv_sched);
- return 0;
-}
-
-static void __exit unregister_sncid(void)
-{
- rv_unregister_monitor(&rv_sncid);
-}
-
-module_init(register_sncid);
-module_exit(unregister_sncid);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
-MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled.");
diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h
deleted file mode 100644
index 21304725142b..000000000000
--- a/kernel/trace/rv/monitors/sncid/sncid.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Automatically generated C representation of sncid automaton
- * For further information about this format, see kernel documentation:
- * Documentation/trace/rv/deterministic_automata.rst
- */
-
-enum states_sncid {
- can_sched_sncid = 0,
- cant_sched_sncid,
- state_max_sncid
-};
-
-#define INVALID_STATE state_max_sncid
-
-enum events_sncid {
- irq_disable_sncid = 0,
- irq_enable_sncid,
- schedule_entry_sncid,
- schedule_exit_sncid,
- event_max_sncid
-};
-
-struct automaton_sncid {
- char *state_names[state_max_sncid];
- char *event_names[event_max_sncid];
- unsigned char function[state_max_sncid][event_max_sncid];
- unsigned char initial_state;
- bool final_states[state_max_sncid];
-};
-
-static const struct automaton_sncid automaton_sncid = {
- .state_names = {
- "can_sched",
- "cant_sched"
- },
- .event_names = {
- "irq_disable",
- "irq_enable",
- "schedule_entry",
- "schedule_exit"
- },
- .function = {
- { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid },
- { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE },
- },
- .initial_state = can_sched_sncid,
- .final_states = { 1, 0 },
-};
diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig
index 77527f971232..7dd54f434ff7 100644
--- a/kernel/trace/rv/monitors/snep/Kconfig
+++ b/kernel/trace/rv/monitors/snep/Kconfig
@@ -2,7 +2,7 @@
#
config RV_MON_SNEP
depends on RV
- depends on PREEMPT_TRACER
+ depends on TRACE_PREEMPT_TOGGLE
depends on RV_MON_SCHED
default y
select DA_MON_EVENTS_IMPLICIT
diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c
index 0076ba6d7ea4..558950f524a5 100644
--- a/kernel/trace/rv/monitors/snep/snep.c
+++ b/kernel/trace/rv/monitors/snep/snep.c
@@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa
da_handle_start_event_snep(preempt_enable_snep);
}
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
{
da_handle_event_snep(schedule_entry_snep);
}
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
{
da_handle_start_event_snep(schedule_exit_snep);
}
@@ -79,8 +79,7 @@ static struct rv_monitor rv_snep = {
static int __init register_snep(void)
{
- rv_register_monitor(&rv_snep, &rv_sched);
- return 0;
+ return rv_register_monitor(&rv_snep, &rv_sched);
}
static void __exit unregister_snep(void)
diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h
index 6d16b9ad931e..4cd9abb77b7b 100644
--- a/kernel/trace/rv/monitors/snep/snep.h
+++ b/kernel/trace/rv/monitors/snep/snep.h
@@ -41,8 +41,18 @@ static const struct automaton_snep automaton_snep = {
"schedule_exit"
},
.function = {
- { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE },
- { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep },
+ {
+ non_scheduling_context_snep,
+ non_scheduling_context_snep,
+ scheduling_contex_snep,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ non_scheduling_context_snep
+ },
},
.initial_state = non_scheduling_context_snep,
.final_states = { 1, 0 },
diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c
index bb1f60d55296..540e686e699f 100644
--- a/kernel/trace/rv/monitors/snroc/snroc.c
+++ b/kernel/trace/rv/monitors/snroc/snroc.c
@@ -68,8 +68,7 @@ static struct rv_monitor rv_snroc = {
static int __init register_snroc(void)
{
- rv_register_monitor(&rv_snroc, &rv_sched);
- return 0;
+ return rv_register_monitor(&rv_snroc, &rv_sched);
}
static void __exit unregister_snroc(void)
diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sssw/Kconfig
index 76bcfef4fd10..23b7eeb38bbf 100644
--- a/kernel/trace/rv/monitors/sncid/Kconfig
+++ b/kernel/trace/rv/monitors/sssw/Kconfig
@@ -1,14 +1,14 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-config RV_MON_SNCID
+config RV_MON_SSSW
depends on RV
- depends on IRQSOFF_TRACER
depends on RV_MON_SCHED
default y
- select DA_MON_EVENTS_IMPLICIT
- bool "sncid monitor"
+ select DA_MON_EVENTS_ID
+ bool "sssw monitor"
help
- Monitor to ensure schedule is not called with interrupt disabled.
+	  Monitor to ensure that a sched_set_state to sleepable leads to
+	  sleeping and that sleeping tasks require a wakeup.
This monitor is part of the sched monitors collection.
For further information, see:
diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c
new file mode 100644
index 000000000000..84b8d890d9d4
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/sssw.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "sssw"
+
+#include <trace/events/sched.h>
+#include <trace/events/signal.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "sssw.h"
+
+static struct rv_monitor rv_sssw;
+DECLARE_DA_MON_PER_TASK(sssw, unsigned char);
+
+static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+{
+ if (state == TASK_RUNNING)
+ da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw);
+ else
+ da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ if (preempt)
+ da_handle_event_sssw(prev, sched_switch_preempt_sssw);
+ else if (prev_state == TASK_RUNNING)
+ da_handle_event_sssw(prev, sched_switch_yield_sssw);
+ else if (prev_state == TASK_RTLOCK_WAIT)
+ /* special case of sleeping task with racy conditions */
+ da_handle_event_sssw(prev, sched_switch_blocking_sssw);
+ else
+ da_handle_event_sssw(prev, sched_switch_suspend_sssw);
+ da_handle_event_sssw(next, sched_switch_in_sssw);
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+ /*
+	 * A wakeup can also lead to the signal_wakeup state although the task
+	 * is actually runnable. The monitor can safely start with this event.
+ */
+ da_handle_start_event_sssw(p, sched_wakeup_sssw);
+}
+
+static void handle_signal_deliver(void *data, int sig,
+ struct kernel_siginfo *info,
+ struct k_sigaction *ka)
+{
+ da_handle_event_sssw(current, signal_deliver_sssw);
+}
+
+static int enable_sssw(void)
+{
+ int retval;
+
+ retval = da_monitor_init_sssw();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("sssw", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup);
+ rv_attach_trace_probe("sssw", signal_deliver, handle_signal_deliver);
+
+ return 0;
+}
+
+static void disable_sssw(void)
+{
+ rv_sssw.enabled = 0;
+
+ rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup);
+ rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver);
+
+ da_monitor_destroy_sssw();
+}
+
+static struct rv_monitor rv_sssw = {
+ .name = "sssw",
+ .description = "set state sleep and wakeup.",
+ .enable = enable_sssw,
+ .disable = disable_sssw,
+ .reset = da_monitor_reset_all_sssw,
+ .enabled = 0,
+};
+
+static int __init register_sssw(void)
+{
+ return rv_register_monitor(&rv_sssw, &rv_sched);
+}
+
+static void __exit unregister_sssw(void)
+{
+ rv_unregister_monitor(&rv_sssw);
+}
+
+module_init(register_sssw);
+module_exit(unregister_sssw);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sssw: set state sleep and wakeup.");
diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h
new file mode 100644
index 000000000000..243d54050c94
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/sssw.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sssw automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_sssw {
+ runnable_sssw = 0,
+ signal_wakeup_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ state_max_sssw
+};
+
+#define INVALID_STATE state_max_sssw
+
+enum events_sssw {
+ sched_set_state_runnable_sssw = 0,
+ sched_set_state_sleepable_sssw,
+ sched_switch_blocking_sssw,
+ sched_switch_in_sssw,
+ sched_switch_preempt_sssw,
+ sched_switch_suspend_sssw,
+ sched_switch_yield_sssw,
+ sched_wakeup_sssw,
+ signal_deliver_sssw,
+ event_max_sssw
+};
+
+struct automaton_sssw {
+ char *state_names[state_max_sssw];
+ char *event_names[event_max_sssw];
+ unsigned char function[state_max_sssw][event_max_sssw];
+ unsigned char initial_state;
+ bool final_states[state_max_sssw];
+};
+
+static const struct automaton_sssw automaton_sssw = {
+ .state_names = {
+ "runnable",
+ "signal_wakeup",
+ "sleepable",
+ "sleeping"
+ },
+ .event_names = {
+ "sched_set_state_runnable",
+ "sched_set_state_sleepable",
+ "sched_switch_blocking",
+ "sched_switch_in",
+ "sched_switch_preempt",
+ "sched_switch_suspend",
+ "sched_switch_yield",
+ "sched_wakeup",
+ "signal_deliver"
+ },
+ .function = {
+ {
+ runnable_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ runnable_sssw,
+ runnable_sssw,
+ INVALID_STATE,
+ runnable_sssw,
+ runnable_sssw,
+ runnable_sssw
+ },
+ {
+ INVALID_STATE,
+ sleepable_sssw,
+ INVALID_STATE,
+ signal_wakeup_sssw,
+ signal_wakeup_sssw,
+ INVALID_STATE,
+ signal_wakeup_sssw,
+ signal_wakeup_sssw,
+ runnable_sssw
+ },
+ {
+ runnable_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ sleepable_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ signal_wakeup_sssw,
+ runnable_sssw,
+ sleepable_sssw
+ },
+ {
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ runnable_sssw,
+ INVALID_STATE
+ },
+ },
+ .initial_state = runnable_sssw,
+ .final_states = { 1, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/sssw/sssw_trace.h b/kernel/trace/rv/monitors/sssw/sssw_trace.h
new file mode 100644
index 000000000000..6c03cfc6960b
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/sssw_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SSSW
+DEFINE_EVENT(event_da_monitor_id, event_sssw,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_sssw,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_SSSW */
diff --git a/kernel/trace/rv/monitors/sts/Kconfig b/kernel/trace/rv/monitors/sts/Kconfig
new file mode 100644
index 000000000000..7d1ff0f6fc91
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/Kconfig
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_STS
+ depends on RV
+ depends on TRACE_IRQFLAGS
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "sts monitor"
+ help
+	  Monitor to ensure the following relationships between the scheduler
+	  and task switches:
+ * the scheduler is called and returns with interrupts disabled
+ * each call to the scheduler has up to one switch
+ * switches only happen inside the scheduler
+ * each call to the scheduler disables interrupts to switch
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c
new file mode 100644
index 000000000000..c4a9cd67c1d2
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/sts.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "sts"
+
+#include <trace/events/sched.h>
+#include <trace/events/irq.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "sts.h"
+
+static struct rv_monitor rv_sts;
+DECLARE_DA_MON_PER_CPU(sts, unsigned char);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+
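+/*
+ * System vectors (local timer, IPIs) are not routed through irq_handler_entry,
+ * so hook the x86 vector-entry tracepoints as additional irq_entry events.
+ */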
+static void handle_vector_irq_entry(void *data, int vector)
+{
+ da_handle_event_sts(irq_entry_sts);
+}
+
+static void attach_vector_irq(void)
+{
+ rv_attach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_attach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_attach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("sts", call_function_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+static void detach_vector_irq(void)
+{
+ rv_detach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_detach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_detach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("sts", call_function_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+#else
+/* We assume irq_entry tracepoints are sufficient on other architectures */
+static void attach_vector_irq(void) { }
+static void detach_vector_irq(void) { }
+#endif
+
+static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_sts(irq_disable_sts);
+}
+
+static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_sts(irq_enable_sts);
+}
+
+static void handle_irq_entry(void *data, int irq, struct irqaction *action)
+{
+ da_handle_event_sts(irq_entry_sts);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ da_handle_event_sts(sched_switch_sts);
+}
+
+static void handle_schedule_entry(void *data, bool preempt)
+{
+ da_handle_event_sts(schedule_entry_sts);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch)
+{
+ da_handle_start_event_sts(schedule_exit_sts);
+}
+
+static int enable_sts(void)
+{
+ int retval;
+
+ retval = da_monitor_init_sts();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sts", irq_disable, handle_irq_disable);
+ rv_attach_trace_probe("sts", irq_enable, handle_irq_enable);
+ rv_attach_trace_probe("sts", irq_handler_entry, handle_irq_entry);
+ rv_attach_trace_probe("sts", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("sts", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("sts", sched_exit_tp, handle_schedule_exit);
+ attach_vector_irq();
+
+ return 0;
+}
+
+static void disable_sts(void)
+{
+ rv_sts.enabled = 0;
+
+ rv_detach_trace_probe("sts", irq_disable, handle_irq_disable);
+ rv_detach_trace_probe("sts", irq_enable, handle_irq_enable);
+ rv_detach_trace_probe("sts", irq_handler_entry, handle_irq_entry);
+ rv_detach_trace_probe("sts", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("sts", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit);
+ detach_vector_irq();
+
+ da_monitor_destroy_sts();
+}
+
+/*
+ * This is the monitor register section.
+ */
+static struct rv_monitor rv_sts = {
+ .name = "sts",
+ .description = "schedule implies task switch.",
+ .enable = enable_sts,
+ .disable = disable_sts,
+ .reset = da_monitor_reset_all_sts,
+ .enabled = 0,
+};
+
+static int __init register_sts(void)
+{
+ return rv_register_monitor(&rv_sts, &rv_sched);
+}
+
+static void __exit unregister_sts(void)
+{
+ rv_unregister_monitor(&rv_sts);
+}
+
+module_init(register_sts);
+module_exit(unregister_sts);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sts: schedule implies task switch.");
diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h
new file mode 100644
index 000000000000..3368b6599a00
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/sts.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sts automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_sts {
+ can_sched_sts = 0,
+ cant_sched_sts,
+ disable_to_switch_sts,
+ enable_to_exit_sts,
+ in_irq_sts,
+ scheduling_sts,
+ switching_sts,
+ state_max_sts
+};
+
+#define INVALID_STATE state_max_sts
+
+enum events_sts {
+ irq_disable_sts = 0,
+ irq_enable_sts,
+ irq_entry_sts,
+ sched_switch_sts,
+ schedule_entry_sts,
+ schedule_exit_sts,
+ event_max_sts
+};
+
+struct automaton_sts {
+ char *state_names[state_max_sts];
+ char *event_names[event_max_sts];
+ unsigned char function[state_max_sts][event_max_sts];
+ unsigned char initial_state;
+ bool final_states[state_max_sts];
+};
+
+static const struct automaton_sts automaton_sts = {
+ .state_names = {
+ "can_sched",
+ "cant_sched",
+ "disable_to_switch",
+ "enable_to_exit",
+ "in_irq",
+ "scheduling",
+ "switching"
+ },
+ .event_names = {
+ "irq_disable",
+ "irq_enable",
+ "irq_entry",
+ "sched_switch",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ {
+ cant_sched_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ scheduling_sts,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ can_sched_sts,
+ cant_sched_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ enable_to_exit_sts,
+ in_irq_sts,
+ switching_sts,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ enable_to_exit_sts,
+ enable_to_exit_sts,
+ enable_to_exit_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ can_sched_sts
+ },
+ {
+ INVALID_STATE,
+ scheduling_sts,
+ in_irq_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ disable_to_switch_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ enable_to_exit_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ },
+ .initial_state = can_sched_sts,
+ .final_states = { 1, 0, 0, 0, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/sts/sts_trace.h
index 4619dbb50cc0..d78beb58d5b3 100644
--- a/kernel/trace/rv/monitors/tss/tss_trace.h
+++ b/kernel/trace/rv/monitors/sts/sts_trace.h
@@ -4,12 +4,12 @@
* Snippet to be included in rv_trace.h
*/
-#ifdef CONFIG_RV_MON_TSS
-DEFINE_EVENT(event_da_monitor, event_tss,
+#ifdef CONFIG_RV_MON_STS
+DEFINE_EVENT(event_da_monitor, event_sts,
TP_PROTO(char *state, char *event, char *next_state, bool final_state),
TP_ARGS(state, event, next_state, final_state));
-DEFINE_EVENT(error_da_monitor, error_tss,
+DEFINE_EVENT(error_da_monitor, error_sts,
TP_PROTO(char *state, char *event),
TP_ARGS(state, event));
-#endif /* CONFIG_RV_MON_TSS */
+#endif /* CONFIG_RV_MON_STS */
diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c
deleted file mode 100644
index 542787e6524f..000000000000
--- a/kernel/trace/rv/monitors/tss/tss.c
+++ /dev/null
@@ -1,91 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/ftrace.h>
-#include <linux/tracepoint.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/rv.h>
-#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
-
-#define MODULE_NAME "tss"
-
-#include <trace/events/sched.h>
-#include <rv_trace.h>
-#include <monitors/sched/sched.h>
-
-#include "tss.h"
-
-static struct rv_monitor rv_tss;
-DECLARE_DA_MON_PER_CPU(tss, unsigned char);
-
-static void handle_sched_switch(void *data, bool preempt,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned int prev_state)
-{
- da_handle_event_tss(sched_switch_tss);
-}
-
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
-{
- da_handle_event_tss(schedule_entry_tss);
-}
-
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
-{
- da_handle_start_event_tss(schedule_exit_tss);
-}
-
-static int enable_tss(void)
-{
- int retval;
-
- retval = da_monitor_init_tss();
- if (retval)
- return retval;
-
- rv_attach_trace_probe("tss", sched_switch, handle_sched_switch);
- rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry);
- rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit);
-
- return 0;
-}
-
-static void disable_tss(void)
-{
- rv_tss.enabled = 0;
-
- rv_detach_trace_probe("tss", sched_switch, handle_sched_switch);
- rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry);
- rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit);
-
- da_monitor_destroy_tss();
-}
-
-static struct rv_monitor rv_tss = {
- .name = "tss",
- .description = "task switch while scheduling.",
- .enable = enable_tss,
- .disable = disable_tss,
- .reset = da_monitor_reset_all_tss,
- .enabled = 0,
-};
-
-static int __init register_tss(void)
-{
- rv_register_monitor(&rv_tss, &rv_sched);
- return 0;
-}
-
-static void __exit unregister_tss(void)
-{
- rv_unregister_monitor(&rv_tss);
-}
-
-module_init(register_tss);
-module_exit(unregister_tss);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
-MODULE_DESCRIPTION("tss: task switch while scheduling.");
diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h
deleted file mode 100644
index f0a36fda1b87..000000000000
--- a/kernel/trace/rv/monitors/tss/tss.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Automatically generated C representation of tss automaton
- * For further information about this format, see kernel documentation:
- * Documentation/trace/rv/deterministic_automata.rst
- */
-
-enum states_tss {
- thread_tss = 0,
- sched_tss,
- state_max_tss
-};
-
-#define INVALID_STATE state_max_tss
-
-enum events_tss {
- sched_switch_tss = 0,
- schedule_entry_tss,
- schedule_exit_tss,
- event_max_tss
-};
-
-struct automaton_tss {
- char *state_names[state_max_tss];
- char *event_names[event_max_tss];
- unsigned char function[state_max_tss][event_max_tss];
- unsigned char initial_state;
- bool final_states[state_max_tss];
-};
-
-static const struct automaton_tss automaton_tss = {
- .state_names = {
- "thread",
- "sched"
- },
- .event_names = {
- "sched_switch",
- "schedule_entry",
- "schedule_exit"
- },
- .function = {
- { INVALID_STATE, sched_tss, INVALID_STATE },
- { sched_tss, INVALID_STATE, thread_tss },
- },
- .initial_state = thread_tss,
- .final_states = { 1, 0 },
-};
diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig
index e464b9294865..87a26195792b 100644
--- a/kernel/trace/rv/monitors/wip/Kconfig
+++ b/kernel/trace/rv/monitors/wip/Kconfig
@@ -2,7 +2,7 @@
#
config RV_MON_WIP
depends on RV
- depends on PREEMPT_TRACER
+ depends on TRACE_PREEMPT_TOGGLE
select DA_MON_EVENTS_IMPLICIT
bool "wip monitor"
help
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index ed758fec8608..4b4e99615a11 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -71,8 +71,7 @@ static struct rv_monitor rv_wip = {
static int __init register_wip(void)
{
- rv_register_monitor(&rv_wip, NULL);
- return 0;
+ return rv_register_monitor(&rv_wip, NULL);
}
static void __exit unregister_wip(void)
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 172f31c4b0f3..4145bea2729e 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -70,8 +70,7 @@ static struct rv_monitor rv_wwnr = {
static int __init register_wwnr(void)
{
- rv_register_monitor(&rv_wwnr, NULL);
- return 0;
+ return rv_register_monitor(&rv_wwnr, NULL);
}
static void __exit unregister_wwnr(void)
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
index 0186ff4cbd0b..74c6bcc2c749 100644
--- a/kernel/trace/rv/reactor_panic.c
+++ b/kernel/trace/rv/reactor_panic.c
@@ -13,9 +13,13 @@
#include <linux/init.h>
#include <linux/rv.h>
-static void rv_panic_reaction(char *msg)
+__printf(1, 2) static void rv_panic_reaction(const char *msg, ...)
{
- panic(msg);
+ va_list args;
+
+ va_start(args, msg);
+ vpanic(msg, args);
+ va_end(args);
}
static struct rv_reactor rv_panic = {
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
index 178759dbf89f..2dae2916c05f 100644
--- a/kernel/trace/rv/reactor_printk.c
+++ b/kernel/trace/rv/reactor_printk.c
@@ -12,9 +12,13 @@
#include <linux/init.h>
#include <linux/rv.h>
-static void rv_printk_reaction(char *msg)
+__printf(1, 2) static void rv_printk_reaction(const char *msg, ...)
{
- printk_deferred(msg);
+ va_list args;
+
+ va_start(args, msg);
+ vprintk_deferred(msg, args);
+ va_end(args);
}
static struct rv_reactor rv_printk = {
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index e4077500a91d..bd7d56dbf6c2 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -143,7 +143,7 @@
#include <linux/init.h>
#include <linux/slab.h>
-#ifdef CONFIG_DA_MON_EVENTS
+#ifdef CONFIG_RV_MON_EVENTS
#define CREATE_TRACE_POINTS
#include <rv_trace.h>
#endif
@@ -165,7 +165,7 @@ struct dentry *get_monitors_root(void)
LIST_HEAD(rv_monitors_list);
static int task_monitor_count;
-static bool task_monitor_slots[RV_PER_TASK_MONITORS];
+static bool task_monitor_slots[CONFIG_RV_PER_TASK_MONITORS];
int rv_get_task_monitor_slot(void)
{
@@ -173,12 +173,12 @@ int rv_get_task_monitor_slot(void)
lockdep_assert_held(&rv_interface_lock);
- if (task_monitor_count == RV_PER_TASK_MONITORS)
+ if (task_monitor_count == CONFIG_RV_PER_TASK_MONITORS)
return -EBUSY;
task_monitor_count++;
- for (i = 0; i < RV_PER_TASK_MONITORS; i++) {
+ for (i = 0; i < CONFIG_RV_PER_TASK_MONITORS; i++) {
if (task_monitor_slots[i] == false) {
task_monitor_slots[i] = true;
return i;
@@ -194,7 +194,7 @@ void rv_put_task_monitor_slot(int slot)
{
lockdep_assert_held(&rv_interface_lock);
- if (slot < 0 || slot >= RV_PER_TASK_MONITORS) {
+ if (slot < 0 || slot >= CONFIG_RV_PER_TASK_MONITORS) {
WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot);
return;
}
@@ -210,9 +210,9 @@ void rv_put_task_monitor_slot(int slot)
* Monitors with a parent are nested,
* Monitors without a parent could be standalone or containers.
*/
-bool rv_is_nested_monitor(struct rv_monitor_def *mdef)
+bool rv_is_nested_monitor(struct rv_monitor *mon)
{
- return mdef->parent != NULL;
+ return mon->parent != NULL;
}
/*
@@ -223,16 +223,16 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef)
* for enable()/disable(). Use this condition to find empty containers.
* Keep both conditions in case we have some non-compliant containers.
*/
-bool rv_is_container_monitor(struct rv_monitor_def *mdef)
+bool rv_is_container_monitor(struct rv_monitor *mon)
{
- struct rv_monitor_def *next;
+ struct rv_monitor *next;
- if (list_is_last(&mdef->list, &rv_monitors_list))
+ if (list_is_last(&mon->list, &rv_monitors_list))
return false;
- next = list_next_entry(mdef, list);
+ next = list_next_entry(mon, list);
- return next->parent == mdef->monitor || !mdef->monitor->enable;
+ return next->parent == mon || !mon->enable;
}
/*
@@ -241,10 +241,10 @@ bool rv_is_container_monitor(struct rv_monitor_def *mdef)
static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
loff_t *ppos)
{
- struct rv_monitor_def *mdef = filp->private_data;
+ struct rv_monitor *mon = filp->private_data;
const char *buff;
- buff = mdef->monitor->enabled ? "1\n" : "0\n";
+ buff = mon->enabled ? "1\n" : "0\n";
return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
}
@@ -252,14 +252,14 @@ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf
/*
 * __rv_disable_monitor - disable an enabled monitor
*/
-static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
+static int __rv_disable_monitor(struct rv_monitor *mon, bool sync)
{
lockdep_assert_held(&rv_interface_lock);
- if (mdef->monitor->enabled) {
- mdef->monitor->enabled = 0;
- if (mdef->monitor->disable)
- mdef->monitor->disable();
+ if (mon->enabled) {
+ mon->enabled = 0;
+ if (mon->disable)
+ mon->disable();
/*
* Wait for the execution of all events to finish.
@@ -273,90 +273,90 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
return 0;
}
-static void rv_disable_single(struct rv_monitor_def *mdef)
+static void rv_disable_single(struct rv_monitor *mon)
{
- __rv_disable_monitor(mdef, true);
+ __rv_disable_monitor(mon, true);
}
-static int rv_enable_single(struct rv_monitor_def *mdef)
+static int rv_enable_single(struct rv_monitor *mon)
{
int retval;
lockdep_assert_held(&rv_interface_lock);
- if (mdef->monitor->enabled)
+ if (mon->enabled)
return 0;
- retval = mdef->monitor->enable();
+ retval = mon->enable();
if (!retval)
- mdef->monitor->enabled = 1;
+ mon->enabled = 1;
return retval;
}
-static void rv_disable_container(struct rv_monitor_def *mdef)
+static void rv_disable_container(struct rv_monitor *mon)
{
- struct rv_monitor_def *p = mdef;
+ struct rv_monitor *p = mon;
int enabled = 0;
list_for_each_entry_continue(p, &rv_monitors_list, list) {
- if (p->parent != mdef->monitor)
+ if (p->parent != mon)
break;
enabled += __rv_disable_monitor(p, false);
}
if (enabled)
tracepoint_synchronize_unregister();
- mdef->monitor->enabled = 0;
+ mon->enabled = 0;
}
-static int rv_enable_container(struct rv_monitor_def *mdef)
+static int rv_enable_container(struct rv_monitor *mon)
{
- struct rv_monitor_def *p = mdef;
+ struct rv_monitor *p = mon;
int retval = 0;
list_for_each_entry_continue(p, &rv_monitors_list, list) {
- if (retval || p->parent != mdef->monitor)
+ if (retval || p->parent != mon)
break;
retval = rv_enable_single(p);
}
if (retval)
- rv_disable_container(mdef);
+ rv_disable_container(mon);
else
- mdef->monitor->enabled = 1;
+ mon->enabled = 1;
return retval;
}
/**
* rv_disable_monitor - disable a given runtime monitor
- * @mdef: Pointer to the monitor definition structure.
+ * @mon: Pointer to the monitor definition structure.
*
* Returns 0 on success.
*/
-int rv_disable_monitor(struct rv_monitor_def *mdef)
+int rv_disable_monitor(struct rv_monitor *mon)
{
- if (rv_is_container_monitor(mdef))
- rv_disable_container(mdef);
+ if (rv_is_container_monitor(mon))
+ rv_disable_container(mon);
else
- rv_disable_single(mdef);
+ rv_disable_single(mon);
return 0;
}
/**
* rv_enable_monitor - enable a given runtime monitor
- * @mdef: Pointer to the monitor definition structure.
+ * @mon: Pointer to the monitor definition structure.
*
* Returns 0 on success, error otherwise.
*/
-int rv_enable_monitor(struct rv_monitor_def *mdef)
+int rv_enable_monitor(struct rv_monitor *mon)
{
int retval;
- if (rv_is_container_monitor(mdef))
- retval = rv_enable_container(mdef);
+ if (rv_is_container_monitor(mon))
+ retval = rv_enable_container(mon);
else
- retval = rv_enable_single(mdef);
+ retval = rv_enable_single(mon);
return retval;
}
@@ -367,7 +367,7 @@ int rv_enable_monitor(struct rv_monitor_def *mdef)
static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf,
size_t count, loff_t *ppos)
{
- struct rv_monitor_def *mdef = filp->private_data;
+ struct rv_monitor *mon = filp->private_data;
int retval;
bool val;
@@ -378,9 +378,9 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
mutex_lock(&rv_interface_lock);
if (val)
- retval = rv_enable_monitor(mdef);
+ retval = rv_enable_monitor(mon);
else
- retval = rv_disable_monitor(mdef);
+ retval = rv_disable_monitor(mon);
mutex_unlock(&rv_interface_lock);
@@ -399,12 +399,12 @@ static const struct file_operations interface_enable_fops = {
static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count,
loff_t *ppos)
{
- struct rv_monitor_def *mdef = filp->private_data;
+ struct rv_monitor *mon = filp->private_data;
char buff[256];
memset(buff, 0, sizeof(buff));
- snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description);
+ snprintf(buff, sizeof(buff), "%s\n", mon->description);
return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
}
@@ -419,37 +419,37 @@ static const struct file_operations interface_desc_fops = {
* the monitor dir, where the specific options of the monitor
* are exposed.
*/
-static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent)
+static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent)
{
struct dentry *root = parent ? parent->root_d : get_monitors_root();
- const char *name = mdef->monitor->name;
+ const char *name = mon->name;
struct dentry *tmp;
int retval;
- mdef->root_d = rv_create_dir(name, root);
- if (!mdef->root_d)
+ mon->root_d = rv_create_dir(name, root);
+ if (!mon->root_d)
return -ENOMEM;
- tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops);
+ tmp = rv_create_file("enable", RV_MODE_WRITE, mon->root_d, mon, &interface_enable_fops);
if (!tmp) {
retval = -ENOMEM;
goto out_remove_root;
}
- tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops);
+ tmp = rv_create_file("desc", RV_MODE_READ, mon->root_d, mon, &interface_desc_fops);
if (!tmp) {
retval = -ENOMEM;
goto out_remove_root;
}
- retval = reactor_populate_monitor(mdef);
+ retval = reactor_populate_monitor(mon);
if (retval)
goto out_remove_root;
return 0;
out_remove_root:
- rv_remove(mdef->root_d);
+ rv_remove(mon->root_d);
return retval;
}
@@ -458,13 +458,12 @@ out_remove_root:
*/
static int monitors_show(struct seq_file *m, void *p)
{
- struct rv_monitor_def *mon_def = p;
+ struct rv_monitor *mon = container_of(p, struct rv_monitor, list);
- if (mon_def->parent)
- seq_printf(m, "%s:%s\n", mon_def->parent->name,
- mon_def->monitor->name);
+ if (mon->parent)
+ seq_printf(m, "%s:%s\n", mon->parent->name, mon->name);
else
- seq_printf(m, "%s\n", mon_def->monitor->name);
+ seq_printf(m, "%s\n", mon->name);
return 0;
}
@@ -496,13 +495,13 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
*/
static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct rv_monitor_def *m_def = p;
+ struct rv_monitor *mon = p;
(*pos)++;
- list_for_each_entry_continue(m_def, &rv_monitors_list, list) {
- if (m_def->monitor->enabled)
- return m_def;
+ list_for_each_entry_continue(mon, &rv_monitors_list, list) {
+ if (mon->enabled)
+ return mon;
}
return NULL;
@@ -510,7 +509,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
{
- struct rv_monitor_def *m_def;
+ struct rv_monitor *mon;
loff_t l;
mutex_lock(&rv_interface_lock);
@@ -518,15 +517,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
if (list_empty(&rv_monitors_list))
return NULL;
- m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list);
+ mon = list_entry(&rv_monitors_list, struct rv_monitor, list);
for (l = 0; l <= *pos; ) {
- m_def = enabled_monitors_next(m, m_def, &l);
- if (!m_def)
+ mon = enabled_monitors_next(m, mon, &l);
+ if (!mon)
break;
}
- return m_def;
+ return mon;
}
/*
@@ -566,13 +565,13 @@ static const struct file_operations available_monitors_ops = {
*/
static void disable_all_monitors(void)
{
- struct rv_monitor_def *mdef;
+ struct rv_monitor *mon;
int enabled = 0;
mutex_lock(&rv_interface_lock);
- list_for_each_entry(mdef, &rv_monitors_list, list)
- enabled += __rv_disable_monitor(mdef, false);
+ list_for_each_entry(mon, &rv_monitors_list, list)
+ enabled += __rv_disable_monitor(mon, false);
if (enabled) {
/*
@@ -598,7 +597,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
size_t count, loff_t *ppos)
{
char buff[MAX_RV_MONITOR_NAME_SIZE + 2];
- struct rv_monitor_def *mdef;
+ struct rv_monitor *mon;
int retval = -EINVAL;
bool enable = true;
char *ptr, *tmp;
@@ -633,17 +632,17 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
if (tmp)
ptr = tmp+1;
- list_for_each_entry(mdef, &rv_monitors_list, list) {
- if (strcmp(ptr, mdef->monitor->name) != 0)
+ list_for_each_entry(mon, &rv_monitors_list, list) {
+ if (strcmp(ptr, mon->name) != 0)
continue;
/*
* Monitor found!
*/
if (enable)
- retval = rv_enable_monitor(mdef);
+ retval = rv_enable_monitor(mon);
else
- retval = rv_disable_monitor(mdef);
+ retval = rv_disable_monitor(mon);
if (!retval)
retval = count;
@@ -702,11 +701,11 @@ static void turn_monitoring_off(void)
static void reset_all_monitors(void)
{
- struct rv_monitor_def *mdef;
+ struct rv_monitor *mon;
- list_for_each_entry(mdef, &rv_monitors_list, list) {
- if (mdef->monitor->enabled && mdef->monitor->reset)
- mdef->monitor->reset();
+ list_for_each_entry(mon, &rv_monitors_list, list) {
+ if (mon->enabled && mon->reset)
+ mon->reset();
}
}
@@ -768,10 +767,9 @@ static const struct file_operations monitoring_on_fops = {
.read = monitoring_on_read_data,
};
-static void destroy_monitor_dir(struct rv_monitor_def *mdef)
+static void destroy_monitor_dir(struct rv_monitor *mon)
{
- reactor_cleanup_monitor(mdef);
- rv_remove(mdef->root_d);
+ rv_remove(mon->root_d);
}
/**
@@ -783,7 +781,7 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef)
*/
int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
{
- struct rv_monitor_def *r, *p = NULL;
+ struct rv_monitor *r;
int retval = 0;
if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
@@ -795,49 +793,31 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
mutex_lock(&rv_interface_lock);
list_for_each_entry(r, &rv_monitors_list, list) {
- if (strcmp(monitor->name, r->monitor->name) == 0) {
+ if (strcmp(monitor->name, r->name) == 0) {
pr_info("Monitor %s is already registered\n", monitor->name);
retval = -EEXIST;
goto out_unlock;
}
}
- if (parent) {
- list_for_each_entry(r, &rv_monitors_list, list) {
- if (strcmp(parent->name, r->monitor->name) == 0) {
- p = r;
- break;
- }
- }
- }
-
- if (p && rv_is_nested_monitor(p)) {
+ if (parent && rv_is_nested_monitor(parent)) {
pr_info("Parent monitor %s is already nested, cannot nest further\n",
parent->name);
retval = -EINVAL;
goto out_unlock;
}
- r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
- if (!r) {
- retval = -ENOMEM;
- goto out_unlock;
- }
-
- r->monitor = monitor;
- r->parent = parent;
+ monitor->parent = parent;
- retval = create_monitor_dir(r, p);
- if (retval) {
- kfree(r);
- goto out_unlock;
- }
+ retval = create_monitor_dir(monitor, parent);
+ if (retval)
+ return retval;
/* keep children close to the parent for easier visualisation */
- if (p)
- list_add(&r->list, &p->list);
+ if (parent)
+ list_add(&monitor->list, &parent->list);
else
- list_add_tail(&r->list, &rv_monitors_list);
+ list_add_tail(&monitor->list, &rv_monitors_list);
out_unlock:
mutex_unlock(&rv_interface_lock);
@@ -852,17 +832,11 @@ out_unlock:
*/
int rv_unregister_monitor(struct rv_monitor *monitor)
{
- struct rv_monitor_def *ptr, *next;
-
mutex_lock(&rv_interface_lock);
- list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) {
- if (strcmp(monitor->name, ptr->monitor->name) == 0) {
- rv_disable_monitor(ptr);
- list_del(&ptr->list);
- destroy_monitor_dir(ptr);
- }
- }
+ rv_disable_monitor(monitor);
+ list_del(&monitor->list);
+ destroy_monitor_dir(monitor);
mutex_unlock(&rv_interface_lock);
return 0;
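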
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index 98fca0a1adbc..1485a70c1bf4 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -23,48 +23,21 @@ struct rv_interface {
extern struct mutex rv_interface_lock;
extern struct list_head rv_monitors_list;
-#ifdef CONFIG_RV_REACTORS
-struct rv_reactor_def {
- struct list_head list;
- struct rv_reactor *reactor;
- /* protected by the monitor interface lock */
- int counter;
-};
-#endif
-
-struct rv_monitor_def {
- struct list_head list;
- struct rv_monitor *monitor;
- struct rv_monitor *parent;
- struct dentry *root_d;
-#ifdef CONFIG_RV_REACTORS
- struct rv_reactor_def *rdef;
- bool reacting;
-#endif
- bool task_monitor;
-};
-
struct dentry *get_monitors_root(void);
-int rv_disable_monitor(struct rv_monitor_def *mdef);
-int rv_enable_monitor(struct rv_monitor_def *mdef);
-bool rv_is_container_monitor(struct rv_monitor_def *mdef);
-bool rv_is_nested_monitor(struct rv_monitor_def *mdef);
+int rv_disable_monitor(struct rv_monitor *mon);
+int rv_enable_monitor(struct rv_monitor *mon);
+bool rv_is_container_monitor(struct rv_monitor *mon);
+bool rv_is_nested_monitor(struct rv_monitor *mon);
#ifdef CONFIG_RV_REACTORS
-int reactor_populate_monitor(struct rv_monitor_def *mdef);
-void reactor_cleanup_monitor(struct rv_monitor_def *mdef);
+int reactor_populate_monitor(struct rv_monitor *mon);
int init_rv_reactors(struct dentry *root_dir);
#else
-static inline int reactor_populate_monitor(struct rv_monitor_def *mdef)
+static inline int reactor_populate_monitor(struct rv_monitor *mon)
{
return 0;
}
-static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
-{
- return;
-}
-
static inline int init_rv_reactors(struct dentry *root_dir)
{
return 0;
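
With rv_monitor_def and rv_reactor_def removed, struct rv_monitor itself now carries the list node, parent pointer, root dentry and reactor that the wrapper used to hold, and registration links the caller's object straight onto rv_monitors_list. A sketch of a monitor definition under the consolidated layout (the field set is inferred from this hunk and the rv.c changes above; the callbacks are hypothetical):

static struct rv_monitor rv_example = {
	.name		= "example",
	.description	= "illustrative monitor definition",
	.enable		= example_enable,	/* hypothetical callbacks */
	.disable	= example_disable,
	.reset		= example_reset,
	/*
	 * .parent, .list, .root_d and .reactor are filled in by the RV core
	 * at rv_register_monitor() time, so the object must stay alive until
	 * rv_unregister_monitor() is called.
	 */
};
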
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 9501ca886d83..d32859fec238 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -70,12 +70,12 @@
*/
static LIST_HEAD(rv_reactors_list);
-static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
+static struct rv_reactor *get_reactor_rdef_by_name(char *name)
{
- struct rv_reactor_def *r;
+ struct rv_reactor *r;
list_for_each_entry(r, &rv_reactors_list, list) {
- if (strcmp(name, r->reactor->name) == 0)
+ if (strcmp(name, r->name) == 0)
return r;
}
return NULL;
@@ -86,9 +86,9 @@ static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
*/
static int reactors_show(struct seq_file *m, void *p)
{
- struct rv_reactor_def *rea_def = p;
+ struct rv_reactor *reactor = container_of(p, struct rv_reactor, list);
- seq_printf(m, "%s\n", rea_def->reactor->name);
+ seq_printf(m, "%s\n", reactor->name);
return 0;
}
@@ -138,13 +138,13 @@ static const struct file_operations available_reactors_ops = {
*/
static int monitor_reactor_show(struct seq_file *m, void *p)
{
- struct rv_monitor_def *mdef = m->private;
- struct rv_reactor_def *rdef = p;
+ struct rv_monitor *mon = m->private;
+ struct rv_reactor *reactor = container_of(p, struct rv_reactor, list);
- if (mdef->rdef == rdef)
- seq_printf(m, "[%s]\n", rdef->reactor->name);
+ if (mon->reactor == reactor)
+ seq_printf(m, "[%s]\n", reactor->name);
else
- seq_printf(m, "%s\n", rdef->reactor->name);
+ seq_printf(m, "%s\n", reactor->name);
return 0;
}
@@ -158,43 +158,37 @@ static const struct seq_operations monitor_reactors_seq_ops = {
.show = monitor_reactor_show
};
-static void monitor_swap_reactors_single(struct rv_monitor_def *mdef,
- struct rv_reactor_def *rdef,
- bool reacting, bool nested)
+static void monitor_swap_reactors_single(struct rv_monitor *mon,
+ struct rv_reactor *reactor,
+ bool nested)
{
bool monitor_enabled;
/* nothing to do */
- if (mdef->rdef == rdef)
+ if (mon->reactor == reactor)
return;
- monitor_enabled = mdef->monitor->enabled;
+ monitor_enabled = mon->enabled;
if (monitor_enabled)
- rv_disable_monitor(mdef);
+ rv_disable_monitor(mon);
- /* swap reactor's usage */
- mdef->rdef->counter--;
- rdef->counter++;
-
- mdef->rdef = rdef;
- mdef->reacting = reacting;
- mdef->monitor->react = rdef->reactor->react;
+ mon->reactor = reactor;
+ mon->react = reactor->react;
/* enable only once if iterating through a container */
if (monitor_enabled && !nested)
- rv_enable_monitor(mdef);
+ rv_enable_monitor(mon);
}
-static void monitor_swap_reactors(struct rv_monitor_def *mdef,
- struct rv_reactor_def *rdef, bool reacting)
+static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor *reactor)
{
- struct rv_monitor_def *p = mdef;
+ struct rv_monitor *p = mon;
- if (rv_is_container_monitor(mdef))
+ if (rv_is_container_monitor(mon))
list_for_each_entry_continue(p, &rv_monitors_list, list) {
- if (p->parent != mdef->monitor)
+ if (p->parent != mon)
break;
- monitor_swap_reactors_single(p, rdef, reacting, true);
+ monitor_swap_reactors_single(p, reactor, true);
}
/*
* This call enables and disables the monitor if they were active.
@@ -202,7 +196,7 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef,
 * All nested monitors are also enabled even if they were off; we may refine
* this logic in the future.
*/
- monitor_swap_reactors_single(mdef, rdef, reacting, false);
+ monitor_swap_reactors_single(mon, reactor, false);
}
static ssize_t
@@ -210,11 +204,10 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
char buff[MAX_RV_REACTOR_NAME_SIZE + 2];
- struct rv_monitor_def *mdef;
- struct rv_reactor_def *rdef;
+ struct rv_monitor *mon;
+ struct rv_reactor *reactor;
struct seq_file *seq_f;
int retval = -EINVAL;
- bool enable;
char *ptr;
int len;
@@ -237,22 +230,17 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
* See monitor_reactors_open()
*/
seq_f = file->private_data;
- mdef = seq_f->private;
+ mon = seq_f->private;
mutex_lock(&rv_interface_lock);
retval = -EINVAL;
- list_for_each_entry(rdef, &rv_reactors_list, list) {
- if (strcmp(ptr, rdef->reactor->name) != 0)
+ list_for_each_entry(reactor, &rv_reactors_list, list) {
+ if (strcmp(ptr, reactor->name) != 0)
continue;
- if (rdef == get_reactor_rdef_by_name("nop"))
- enable = false;
- else
- enable = true;
-
- monitor_swap_reactors(mdef, rdef, enable);
+ monitor_swap_reactors(mon, reactor);
retval = count;
break;
@@ -268,7 +256,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
*/
static int monitor_reactors_open(struct inode *inode, struct file *file)
{
- struct rv_monitor_def *mdef = inode->i_private;
+ struct rv_monitor *mon = inode->i_private;
struct seq_file *seq_f;
int ret;
@@ -284,7 +272,7 @@ static int monitor_reactors_open(struct inode *inode, struct file *file)
/*
* Copy the create file "private" data to the seq_file private data.
*/
- seq_f->private = mdef;
+ seq_f->private = mon;
return 0;
};
@@ -299,23 +287,16 @@ static const struct file_operations monitor_reactors_ops = {
static int __rv_register_reactor(struct rv_reactor *reactor)
{
- struct rv_reactor_def *r;
+ struct rv_reactor *r;
list_for_each_entry(r, &rv_reactors_list, list) {
- if (strcmp(reactor->name, r->reactor->name) == 0) {
+ if (strcmp(reactor->name, r->name) == 0) {
pr_info("Reactor %s is already registered\n", reactor->name);
return -EINVAL;
}
}
- r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- r->reactor = reactor;
- r->counter = 0;
-
- list_add_tail(&r->list, &rv_reactors_list);
+ list_add_tail(&reactor->list, &rv_reactors_list);
return 0;
}
@@ -350,30 +331,10 @@ int rv_register_reactor(struct rv_reactor *reactor)
*/
int rv_unregister_reactor(struct rv_reactor *reactor)
{
- struct rv_reactor_def *ptr, *next;
- int ret = 0;
-
mutex_lock(&rv_interface_lock);
-
- list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) {
- if (strcmp(reactor->name, ptr->reactor->name) == 0) {
-
- if (!ptr->counter) {
- list_del(&ptr->list);
- } else {
- printk(KERN_WARNING
- "rv: the rv_reactor %s is in use by %d monitor(s)\n",
- ptr->reactor->name, ptr->counter);
- printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
- ptr->reactor->name);
- ret = -EBUSY;
- break;
- }
- }
- }
-
+ list_del(&reactor->list);
mutex_unlock(&rv_interface_lock);
- return ret;
+ return 0;
}
/*
@@ -454,43 +415,30 @@ static const struct file_operations reacting_on_fops = {
/**
* reactor_populate_monitor - creates per monitor reactors file
- * @mdef: monitor's definition.
+ * @mon: The monitor.
*
* Returns 0 if successful, error otherwise.
*/
-int reactor_populate_monitor(struct rv_monitor_def *mdef)
+int reactor_populate_monitor(struct rv_monitor *mon)
{
struct dentry *tmp;
- tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops);
+ tmp = rv_create_file("reactors", RV_MODE_WRITE, mon->root_d, mon, &monitor_reactors_ops);
if (!tmp)
return -ENOMEM;
/*
* Configure as the rv_nop reactor.
*/
- mdef->rdef = get_reactor_rdef_by_name("nop");
- mdef->rdef->counter++;
- mdef->reacting = false;
+ mon->reactor = get_reactor_rdef_by_name("nop");
return 0;
}
-/**
- * reactor_cleanup_monitor - cleanup a monitor reference
- * @mdef: monitor's definition.
- */
-void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
-{
- lockdep_assert_held(&rv_interface_lock);
- mdef->rdef->counter--;
- WARN_ON_ONCE(mdef->rdef->counter < 0);
-}
-
/*
* Nop reactor register
*/
-static void rv_nop_reaction(char *msg)
+__printf(1, 2) static void rv_nop_reaction(const char *msg, ...)
{
}
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 422b75f58891..4a6faddac614 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -16,24 +16,24 @@ DECLARE_EVENT_CLASS(event_da_monitor,
TP_ARGS(state, event, next_state, final_state),
TP_STRUCT__entry(
- __array( char, state, MAX_DA_NAME_LEN )
- __array( char, event, MAX_DA_NAME_LEN )
- __array( char, next_state, MAX_DA_NAME_LEN )
- __field( bool, final_state )
+ __string( state, state )
+ __string( event, event )
+ __string( next_state, next_state )
+ __field( bool, final_state )
),
TP_fast_assign(
- memcpy(__entry->state, state, MAX_DA_NAME_LEN);
- memcpy(__entry->event, event, MAX_DA_NAME_LEN);
- memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
- __entry->final_state = final_state;
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(next_state);
+ __entry->final_state = final_state;
),
- TP_printk("%s x %s -> %s %s",
- __entry->state,
- __entry->event,
- __entry->next_state,
- __entry->final_state ? "(final)" : "")
+ TP_printk("%s x %s -> %s%s",
+ __get_str(state),
+ __get_str(event),
+ __get_str(next_state),
+ __entry->final_state ? " (final)" : "")
);
DECLARE_EVENT_CLASS(error_da_monitor,
@@ -43,26 +43,26 @@ DECLARE_EVENT_CLASS(error_da_monitor,
TP_ARGS(state, event),
TP_STRUCT__entry(
- __array( char, state, MAX_DA_NAME_LEN )
- __array( char, event, MAX_DA_NAME_LEN )
+ __string( state, state )
+ __string( event, event )
),
TP_fast_assign(
- memcpy(__entry->state, state, MAX_DA_NAME_LEN);
- memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ __assign_str(state);
+ __assign_str(event);
),
TP_printk("event %s not expected in the state %s",
- __entry->event,
- __entry->state)
+ __get_str(event),
+ __get_str(state))
);
#include <monitors/wip/wip_trace.h>
-#include <monitors/tss/tss_trace.h>
#include <monitors/sco/sco_trace.h>
#include <monitors/scpd/scpd_trace.h>
#include <monitors/snep/snep_trace.h>
-#include <monitors/sncid/sncid_trace.h>
+#include <monitors/sts/sts_trace.h>
+#include <monitors/opid/opid_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here
#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
@@ -75,27 +75,27 @@ DECLARE_EVENT_CLASS(event_da_monitor_id,
TP_ARGS(id, state, event, next_state, final_state),
TP_STRUCT__entry(
- __field( int, id )
- __array( char, state, MAX_DA_NAME_LEN )
- __array( char, event, MAX_DA_NAME_LEN )
- __array( char, next_state, MAX_DA_NAME_LEN )
- __field( bool, final_state )
+ __field( int, id )
+ __string( state, state )
+ __string( event, event )
+ __string( next_state, next_state )
+ __field( bool, final_state )
),
TP_fast_assign(
- memcpy(__entry->state, state, MAX_DA_NAME_LEN);
- memcpy(__entry->event, event, MAX_DA_NAME_LEN);
- memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
- __entry->id = id;
- __entry->final_state = final_state;
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(next_state);
+ __entry->id = id;
+ __entry->final_state = final_state;
),
- TP_printk("%d: %s x %s -> %s %s",
+ TP_printk("%d: %s x %s -> %s%s",
__entry->id,
- __entry->state,
- __entry->event,
- __entry->next_state,
- __entry->final_state ? "(final)" : "")
+ __get_str(state),
+ __get_str(event),
+ __get_str(next_state),
+ __entry->final_state ? " (final)" : "")
);
DECLARE_EVENT_CLASS(error_da_monitor_id,
@@ -105,32 +105,108 @@ DECLARE_EVENT_CLASS(error_da_monitor_id,
TP_ARGS(id, state, event),
TP_STRUCT__entry(
- __field( int, id )
- __array( char, state, MAX_DA_NAME_LEN )
- __array( char, event, MAX_DA_NAME_LEN )
+ __field( int, id )
+ __string( state, state )
+ __string( event, event )
),
TP_fast_assign(
- memcpy(__entry->state, state, MAX_DA_NAME_LEN);
- memcpy(__entry->event, event, MAX_DA_NAME_LEN);
- __entry->id = id;
+ __assign_str(state);
+ __assign_str(event);
+ __entry->id = id;
),
TP_printk("%d: event %s not expected in the state %s",
__entry->id,
- __entry->event,
- __entry->state)
+ __get_str(event),
+ __get_str(state))
);
#include <monitors/wwnr/wwnr_trace.h>
#include <monitors/snroc/snroc_trace.h>
+#include <monitors/nrp/nrp_trace.h>
+#include <monitors/sssw/sssw_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
#endif /* CONFIG_DA_MON_EVENTS_ID */
+#ifdef CONFIG_LTL_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_ltl_monitor_id,
+
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+
+ TP_ARGS(task, states, atoms, next),
+
+ TP_STRUCT__entry(
+ __string(comm, task->comm)
+ __field(pid_t, pid)
+ __string(states, states)
+ __string(atoms, atoms)
+ __string(next, next)
+ ),
+
+ TP_fast_assign(
+ __assign_str(comm);
+ __entry->pid = task->pid;
+ __assign_str(states);
+ __assign_str(atoms);
+ __assign_str(next);
+ ),
+
+ TP_printk("%s[%d]: (%s) x (%s) -> (%s)", __get_str(comm), __entry->pid,
+ __get_str(states), __get_str(atoms), __get_str(next))
+);
+
+DECLARE_EVENT_CLASS(error_ltl_monitor_id,
+
+ TP_PROTO(struct task_struct *task),
+
+ TP_ARGS(task),
+
+ TP_STRUCT__entry(
+ __string(comm, task->comm)
+ __field(pid_t, pid)
+ ),
+
+ TP_fast_assign(
+ __assign_str(comm);
+ __entry->pid = task->pid;
+ ),
+
+ TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid)
+);
+#include <monitors/pagefault/pagefault_trace.h>
+#include <monitors/sleep/sleep_trace.h>
+// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here
+#endif /* CONFIG_LTL_MON_EVENTS_ID */
+
+#ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS
+/* Tracepoint useful for monitor development, currently only used in DA */
+TRACE_EVENT(rv_retries_error,
+
+ TP_PROTO(char *name, char *event),
+
+ TP_ARGS(name, event),
+
+ TP_STRUCT__entry(
+ __string( name, name )
+ __string( event, event )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name);
+ __assign_str(event);
+ ),
+
+ TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS)
+ " retries reached for event %s, resetting monitor %s",
+ __get_str(event), __get_str(name))
+);
+#endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */
#endif /* _TRACE_RV_H */
-/* This part ust be outside protection */
+/* This part must be outside protection */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE rv_trace
#include <trace/define_trace.h>
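
The event classes above move from fixed MAX_DA_NAME_LEN char arrays to dynamic strings, i.e. the generic __string()/__assign_str()/__get_str() pattern of the tracing infrastructure. A stand-alone sketch of that pattern (the event and field names are made up; note the single-argument __assign_str() form used throughout this patch, which copies the source named in __string()):

TRACE_EVENT(example_transition,

	TP_PROTO(char *state, char *event),

	TP_ARGS(state, event),

	TP_STRUCT__entry(
		__string( state, state )	/* sized from the source string */
		__string( event, event )
	),

	TP_fast_assign(
		__assign_str(state);		/* copies the string given to __string() */
		__assign_str(event);
	),

	TP_printk("%s x %s", __get_str(state), __get_str(event))
);
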
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5b8db27fb6ef..7996f26c3f46 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -51,6 +51,7 @@
#include <linux/workqueue.h>
#include <linux/sort.h>
#include <linux/io.h> /* vmap_page_range() */
+#include <linux/fs_context.h>
#include <asm/setup.h> /* COMMAND_LINE_SIZE */
@@ -120,6 +121,7 @@ static int tracing_disabled = 1;
cpumask_var_t __read_mostly tracing_buffer_mask;
+#define MAX_TRACER_SIZE 100
/*
* ftrace_dump_on_oops - variable to dump ftrace buffer on oops
*
@@ -142,7 +144,40 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0";
/* When set, tracing will stop when a WARN*() is hit */
-int __disable_trace_on_warning;
+static int __disable_trace_on_warning;
+
+int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos);
+static const struct ctl_table trace_sysctl_table[] = {
+ {
+ .procname = "ftrace_dump_on_oops",
+ .data = &ftrace_dump_on_oops,
+ .maxlen = MAX_TRACER_SIZE,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "traceoff_on_warning",
+ .data = &__disable_trace_on_warning,
+ .maxlen = sizeof(__disable_trace_on_warning),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "tracepoint_printk",
+ .data = &tracepoint_printk,
+ .maxlen = sizeof(tracepoint_printk),
+ .mode = 0644,
+ .proc_handler = tracepoint_printk_sysctl,
+ },
+};
+
+static int __init init_trace_sysctls(void)
+{
+ register_sysctl_init("kernel", trace_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_trace_sysctls);
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
/* Map of enums to their values, for "eval_map" file */
@@ -493,7 +528,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \
- TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK)
+ TRACE_ITER_HASH_PTR | TRACE_ITER_TRACE_PRINTK | \
+ TRACE_ITER_COPY_MARKER)
/* trace_options that are only supported by global_trace */
#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
@@ -501,7 +537,8 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_export);
/* trace_flags that are default zero for instances */
#define ZEROED_TRACE_FLAGS \
- (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK)
+ (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK | TRACE_ITER_TRACE_PRINTK | \
+ TRACE_ITER_COPY_MARKER)
/*
* The global_trace is the descriptor that holds the top-level tracing
@@ -513,6 +550,9 @@ static struct trace_array global_trace = {
static struct trace_array *printk_trace = &global_trace;
+/* List of trace_arrays interested in the top level trace_marker */
+static LIST_HEAD(marker_copies);
+
static __always_inline bool printk_binsafe(struct trace_array *tr)
{
/*
@@ -534,6 +574,28 @@ static void update_printk_trace(struct trace_array *tr)
tr->trace_flags |= TRACE_ITER_TRACE_PRINTK;
}
+/* Returns true if the status of tr changed */
+static bool update_marker_trace(struct trace_array *tr, int enabled)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (enabled) {
+ if (!list_empty(&tr->marker_list))
+ return false;
+
+ list_add_rcu(&tr->marker_list, &marker_copies);
+ tr->trace_flags |= TRACE_ITER_COPY_MARKER;
+ return true;
+ }
+
+ if (list_empty(&tr->marker_list))
+ return false;
+
+ list_del_init(&tr->marker_list);
+ tr->trace_flags &= ~TRACE_ITER_COPY_MARKER;
+ return true;
+}
+
void trace_set_ring_buffer_expanded(struct trace_array *tr)
{
if (!tr)
@@ -1583,6 +1645,39 @@ void tracer_tracing_off(struct trace_array *tr)
}
/**
+ * tracer_tracing_disable() - temporarily disable writes to the buffer
+ * @tr: The trace array to disable its buffer for
+ *
+ * Expects tracer_tracing_enable() to re-enable tracing.
+ * The difference between this and tracer_tracing_off() is that this
+ * is a counter and can nest, whereas tracer_tracing_off() can
+ * be called multiple times and a single tracer_tracing_on() will
+ * enable it.
+ */
+void tracer_tracing_disable(struct trace_array *tr)
+{
+ if (WARN_ON_ONCE(!tr->array_buffer.buffer))
+ return;
+
+ ring_buffer_record_disable(tr->array_buffer.buffer);
+}
+
+/**
+ * tracer_tracing_enable() - counterpart of tracer_tracing_disable()
+ * @tr: The trace array that had tracer_tracing_disable() called on it
+ *
+ * This is called after tracer_tracing_disable() has been called on @tr,
+ * when it's safe to re-enable tracing.
+ */
+void tracer_tracing_enable(struct trace_array *tr)
+{
+ if (WARN_ON_ONCE(!tr->array_buffer.buffer))
+ return;
+
+ ring_buffer_record_enable(tr->array_buffer.buffer);
+}
+
+/**
* tracing_off - turn off tracing buffers
*
* This function stops the tracing buffers from recording data.
@@ -4640,21 +4735,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->array_buffer->buffer,
- cpu, GFP_KERNEL);
- }
- ring_buffer_read_prepare_sync();
- for_each_tracing_cpu(cpu) {
- ring_buffer_read_start(iter->buffer_iter[cpu]);
+ ring_buffer_read_start(iter->array_buffer->buffer,
+ cpu, GFP_KERNEL);
tracing_iter_reset(iter, cpu);
}
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->array_buffer->buffer,
- cpu, GFP_KERNEL);
- ring_buffer_read_prepare_sync();
- ring_buffer_read_start(iter->buffer_iter[cpu]);
+ ring_buffer_read_start(iter->array_buffer->buffer,
+ cpu, GFP_KERNEL);
tracing_iter_reset(iter, cpu);
}
@@ -5048,7 +5137,6 @@ int tracing_set_cpumask(struct trace_array *tr,
*/
if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
#ifdef CONFIG_TRACER_MAX_TRACE
ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
@@ -5056,7 +5144,6 @@ int tracing_set_cpumask(struct trace_array *tr,
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
#ifdef CONFIG_TRACER_MAX_TRACE
ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
@@ -5189,7 +5276,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
if ((mask == TRACE_ITER_RECORD_TGID) ||
(mask == TRACE_ITER_RECORD_CMD) ||
- (mask == TRACE_ITER_TRACE_PRINTK))
+ (mask == TRACE_ITER_TRACE_PRINTK) ||
+ (mask == TRACE_ITER_COPY_MARKER))
lockdep_assert_held(&event_mutex);
/* do nothing if flag is already set */
@@ -5220,6 +5308,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
}
}
+ if (mask == TRACE_ITER_COPY_MARKER)
+ update_marker_trace(tr, enabled);
+
if (enabled)
tr->trace_flags |= mask;
else
@@ -6004,6 +6095,7 @@ struct trace_mod_entry {
};
struct trace_scratch {
+ unsigned int clock_id;
unsigned long text_addr;
unsigned long nr_entries;
struct trace_mod_entry entries[];
@@ -6032,6 +6124,7 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr)
struct trace_module_delta *module_delta;
struct trace_scratch *tscratch;
struct trace_mod_entry *entry;
+ unsigned long raddr;
int idx = 0, nr_entries;
/* If we don't have last boot delta, return the address */
@@ -6045,7 +6138,9 @@ unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr)
module_delta = READ_ONCE(tr->module_delta);
if (!module_delta || !tscratch->nr_entries ||
tscratch->entries[0].mod_addr > addr) {
- return addr + tr->text_delta;
+ raddr = addr + tr->text_delta;
+ return __is_kernel(raddr) || is_kernel_core_data(raddr) ||
+ is_kernel_rodata(raddr) ? raddr : addr;
}
/* Note that entries must be sorted. */
@@ -6116,6 +6211,7 @@ static void update_last_data(struct trace_array *tr)
if (tr->scratch) {
struct trace_scratch *tscratch = tr->scratch;
+ tscratch->clock_id = tr->clock_id;
memset(tscratch->entries, 0,
flex_array_size(tscratch, entries, tscratch->nr_entries));
tscratch->nr_entries = 0;
@@ -6610,6 +6706,22 @@ static int tracing_wait_pipe(struct file *filp)
return 1;
}
+static bool update_last_data_if_empty(struct trace_array *tr)
+{
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return false;
+
+ if (!ring_buffer_empty(tr->array_buffer.buffer))
+ return false;
+
+ /*
+ * If the buffer contains the last boot data and all per-cpu
+ * buffers are empty, reset it from the kernel side.
+ */
+ update_last_data(tr);
+ return true;
+}
+
/*
* Consumer reader.
*/
@@ -6641,6 +6753,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
}
waitagain:
+ if (update_last_data_if_empty(iter->tr))
+ return 0;
+
sret = tracing_wait_pipe(filp);
if (sret <= 0)
return sret;
@@ -6824,7 +6939,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
min((size_t)trace_seq_used(&iter->seq),
- PAGE_SIZE));
+ (size_t)PAGE_SIZE));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
@@ -7100,11 +7215,9 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
#define TRACE_MARKER_MAX_SIZE 4096
-static ssize_t
-tracing_mark_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *fpos)
+static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user *ubuf,
+ size_t cnt, unsigned long ip)
{
- struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
@@ -7118,18 +7231,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
#define FAULTED_STR "<faulted>"
#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */
- if (tracing_disabled)
- return -EINVAL;
-
- if (!(tr->trace_flags & TRACE_ITER_MARKERS))
- return -EINVAL;
-
- if ((ssize_t)cnt < 0)
- return -EINVAL;
-
- if (cnt > TRACE_MARKER_MAX_SIZE)
- cnt = TRACE_MARKER_MAX_SIZE;
-
meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
again:
size = cnt + meta_size;
@@ -7162,7 +7263,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
}
entry = ring_buffer_event_data(event);
- entry->ip = _THIS_IP_;
+ entry->ip = ip;
len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
if (len) {
@@ -7195,18 +7296,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
}
static ssize_t
-tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
+tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
struct trace_array *tr = filp->private_data;
- struct ring_buffer_event *event;
- struct trace_buffer *buffer;
- struct raw_data_entry *entry;
- ssize_t written;
- int size;
- int len;
-
-#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+ ssize_t written = -ENODEV;
+ unsigned long ip;
if (tracing_disabled)
return -EINVAL;
@@ -7214,10 +7309,42 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
- /* The marker must at least have a tag id */
- if (cnt < sizeof(unsigned int))
+ if ((ssize_t)cnt < 0)
return -EINVAL;
+ if (cnt > TRACE_MARKER_MAX_SIZE)
+ cnt = TRACE_MARKER_MAX_SIZE;
+
+ /* The selftests expect the recorded IP address to be in this function */
+ ip = _THIS_IP_;
+
+ /* The global trace_marker can go to multiple instances */
+ if (tr == &global_trace) {
+ guard(rcu)();
+ list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
+ written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+ if (written < 0)
+ break;
+ }
+ } else {
+ written = write_marker_to_buffer(tr, ubuf, cnt, ip);
+ }
+
+ return written;
+}
+
+static ssize_t write_raw_marker_to_buffer(struct trace_array *tr,
+ const char __user *ubuf, size_t cnt)
+{
+ struct ring_buffer_event *event;
+ struct trace_buffer *buffer;
+ struct raw_data_entry *entry;
+ ssize_t written;
+ int size;
+ int len;
+
+#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
@@ -7248,6 +7375,40 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
return written;
}
+static ssize_t
+tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *fpos)
+{
+ struct trace_array *tr = filp->private_data;
+ ssize_t written = -ENODEV;
+
+#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int))
+
+ if (tracing_disabled)
+ return -EINVAL;
+
+ if (!(tr->trace_flags & TRACE_ITER_MARKERS))
+ return -EINVAL;
+
+ /* The marker must at least have a tag id */
+ if (cnt < sizeof(unsigned int))
+ return -EINVAL;
+
+ /* The global trace_marker_raw can go to multiple instances */
+ if (tr == &global_trace) {
+ guard(rcu)();
+ list_for_each_entry_rcu(tr, &marker_copies, marker_list) {
+ written = write_raw_marker_to_buffer(tr, ubuf, cnt);
+ if (written < 0)
+ break;
+ }
+ } else {
+ written = write_raw_marker_to_buffer(tr, ubuf, cnt);
+ }
+
+ return written;
+}
+
static int tracing_clock_show(struct seq_file *m, void *v)
{
struct trace_array *tr = m->private;
@@ -7292,6 +7453,12 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tracing_reset_online_cpus(&tr->max_buffer);
#endif
+ if (tr->scratch && !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT)) {
+ struct trace_scratch *tscratch = tr->scratch;
+
+ tscratch->clock_id = i;
+ }
+
mutex_unlock(&trace_types_lock);
return 0;
@@ -8167,6 +8334,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (ret < 0) {
if (trace_empty(iter) && !iter->closed) {
+ if (update_last_data_if_empty(iter->tr))
+ return 0;
+
if ((filp->f_flags & O_NONBLOCK))
return -EAGAIN;
@@ -8508,10 +8678,6 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
if (iter->tr->flags & TRACE_ARRAY_FL_MEMMAP)
return -ENODEV;
- /* Currently the boot mapped buffer is not supported for mmap */
- if (iter->tr->flags & TRACE_ARRAY_FL_BOOT)
- return -ENODEV;
-
ret = get_snapshot_map(iter->tr);
if (ret)
return ret;
@@ -9517,6 +9683,15 @@ static void setup_trace_scratch(struct trace_array *tr,
/* Scan modules to make text delta for modules. */
module_for_each_mod(make_mod_delta, tr);
+
+ /* Set trace_clock as the same of the previous boot. */
+ if (tscratch->clock_id != tr->clock_id) {
+ if (tscratch->clock_id >= ARRAY_SIZE(trace_clocks) ||
+ tracing_set_clock(tr, trace_clocks[tscratch->clock_id].name) < 0) {
+ pr_info("the previous trace_clock info is not valid.");
+ goto reset;
+ }
+ }
return;
reset:
/* Invalid trace modules */
@@ -9741,6 +9916,7 @@ trace_array_create_systems(const char *name, const char *systems,
INIT_LIST_HEAD(&tr->events);
INIT_LIST_HEAD(&tr->hist_vars);
INIT_LIST_HEAD(&tr->err_log);
+ INIT_LIST_HEAD(&tr->marker_list);
#ifdef CONFIG_MODULES
INIT_LIST_HEAD(&tr->mod_events);
@@ -9900,6 +10076,9 @@ static int __remove_instance(struct trace_array *tr)
if (printk_trace == tr)
update_printk_trace(&global_trace);
+ if (update_marker_trace(tr, 0))
+ synchronize_rcu();
+
tracing_set_nop(tr);
clear_ftrace_function_probes(tr);
event_trace_del_tracer(tr);
@@ -10075,6 +10254,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
struct file_system_type *type;
+ struct fs_context *fc;
+ int ret;
/*
* To maintain backward compatibility for tools that mount
@@ -10084,12 +10265,20 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
type = get_fs_type("tracefs");
if (!type)
return NULL;
- mnt = vfs_submount(mntpt, type, "tracefs", NULL);
+
+ fc = fs_context_for_submount(type, mntpt);
put_filesystem(type);
- if (IS_ERR(mnt))
- return NULL;
- mntget(mnt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ret = vfs_parse_fs_string(fc, "source",
+ "tracefs", strlen("tracefs"));
+ if (!ret)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(ret);
+ put_fs_context(fc);
return mnt;
}
@@ -10448,7 +10637,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
static struct trace_iterator iter;
unsigned int old_userobj;
unsigned long flags;
- int cnt = 0, cpu;
+ int cnt = 0;
/*
* Always turn off tracing when we dump.
@@ -10465,9 +10654,8 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
/* Simulate the iterator */
trace_init_iter(&iter, tr);
- for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ /* While dumping, do not allow the buffer to be enabled */
+ tracer_tracing_disable(tr);
old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -10526,9 +10714,7 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
tr->trace_flags |= old_userobj;
- for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ tracer_tracing_enable(tr);
local_irq_restore(flags);
}
@@ -10968,6 +11154,7 @@ __init static int tracer_alloc_buffers(void)
INIT_LIST_HEAD(&global_trace.events);
INIT_LIST_HEAD(&global_trace.hist_vars);
INIT_LIST_HEAD(&global_trace.err_log);
+ list_add(&global_trace.marker_list, &marker_copies);
list_add(&global_trace.list, &ftrace_trace_arrays);
apply_trace_boot_options();
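
Among the trace.c changes above, the ftrace_dump_on_oops, traceoff_on_warning and tracepoint_printk knobs are now registered locally through a const ctl_table plus a register_sysctl_init() initcall. A minimal sketch of that registration pattern for a hypothetical integer knob (the knob name and variable are made up):

static int example_knob;

static const struct ctl_table example_sysctl_table[] = {
	{
		.procname	= "example_knob",	/* /proc/sys/kernel/example_knob */
		.data		= &example_knob,
		.maxlen		= sizeof(example_knob),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int __init init_example_sysctls(void)
{
	register_sysctl_init("kernel", example_sysctl_table);
	return 0;
}
subsys_initcall(init_example_sysctls);
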
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79be1995db44..bd084953a98b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -183,8 +183,7 @@ struct trace_array;
* the trace, etc.)
*/
struct trace_array_cpu {
- atomic_t disabled;
- void *buffer_page; /* ring buffer spare */
+ local_t disabled;
unsigned long entries;
unsigned long saved_latency;
@@ -404,6 +403,7 @@ struct trace_array {
struct trace_options *topts;
struct list_head systems;
struct list_head events;
+ struct list_head marker_list;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
/* one per_cpu trace_pipe can be opened by only one user */
@@ -665,12 +665,29 @@ bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
void tracer_tracing_off(struct trace_array *tr);
+void tracer_tracing_disable(struct trace_array *tr);
+void tracer_tracing_enable(struct trace_array *tr);
struct dentry *trace_create_file(const char *name,
umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops);
+
+/**
+ * tracer_tracing_is_on_cpu - show the real state of the ring buffer for a CPU
+ * @tr : the trace array whose ring buffer is checked
+ * @cpu: the CPU whose buffer to check
+ *
+ * Shows the real state of the per-CPU buffer: whether it is enabled or not.
+ */
+static inline bool tracer_tracing_is_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (tr->array_buffer.buffer)
+ return ring_buffer_record_is_on_cpu(tr->array_buffer.buffer, cpu);
+ return false;
+}
+
int tracing_init_dentry(void);
struct ring_buffer_event;
@@ -1368,6 +1385,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(MARKERS, "markers"), \
C(EVENT_FORK, "event-fork"), \
C(TRACE_PRINTK, "trace_printk_dest"), \
+ C(COPY_MARKER, "copy_trace_marker"),\
C(PAUSE_ON_TRACE, "pause-on-trace"), \
C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \
FUNCTION_FLAGS \
@@ -1772,6 +1790,9 @@ extern int event_enable_register_trigger(char *glob,
extern void event_enable_unregister_trigger(char *glob,
struct event_trigger_data *test,
struct trace_event_file *file);
+extern struct event_trigger_data *
+trigger_data_alloc(struct event_command *cmd_ops, char *cmd, char *param,
+ void *private_data);
extern void trigger_data_free(struct event_trigger_data *data);
extern int event_trigger_init(struct event_trigger_data *data);
extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
@@ -1798,11 +1819,6 @@ extern bool event_trigger_check_remove(const char *glob);
extern bool event_trigger_empty_param(const char *param);
extern int event_trigger_separate_filter(char *param_and_filter, char **param,
char **filter, bool param_required);
-extern struct event_trigger_data *
-event_trigger_alloc(struct event_command *cmd_ops,
- char *cmd,
- char *param,
- void *private_data);
extern int event_trigger_parse_num(char *trigger,
struct event_trigger_data *trigger_data);
extern int event_trigger_set_filter(struct event_command *cmd_ops,
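
The tracer_tracing_disable()/tracer_tracing_enable() pair declared above wraps the ring buffer's record-disable counter, so calls nest, unlike the plain on/off switch. A sketch of the intended call pattern (the surrounding function is hypothetical; ftrace_dump_one() in trace.c above is the in-tree user):

static void example_inspect(struct trace_array *tr)
{
	tracer_tracing_disable(tr);	/* nests: every disable needs a matching enable */

	/* ... walk or dump tr->array_buffer without new writers racing in ... */

	tracer_tracing_enable(tr);
}
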
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6d08a5523ce0..6809b370e991 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,7 +32,6 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
struct trace_array *tr = branch_tracer;
struct trace_buffer *buffer;
- struct trace_array_cpu *data;
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags;
@@ -54,8 +53,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
raw_local_irq_save(flags);
current->trace_recursion |= TRACE_BRANCH_BIT;
- data = this_cpu_ptr(tr->array_buffer.data);
- if (atomic_read(&data->disabled))
+ if (!tracer_tracing_is_on_cpu(tr, raw_smp_processor_id()))
goto out;
trace_ctx = tracing_gen_ctx_flags(flags);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4ef4df6623a8..de294ae2c5c5 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -97,11 +97,11 @@ FTRACE_ENTRY_PACKED(fgraph_retaddr_entry, fgraph_retaddr_ent_entry,
F_STRUCT(
__field_struct( struct fgraph_retaddr_ent, graph_ent )
__field_packed( unsigned long, graph_ent, func )
- __field_packed( int, graph_ent, depth )
+ __field_packed( unsigned int, graph_ent, depth )
__field_packed( unsigned long, graph_ent, retaddr )
),
- F_printk("--> %ps (%d) <- %ps", (void *)__entry->func, __entry->depth,
+ F_printk("--> %ps (%u) <- %ps", (void *)__entry->func, __entry->depth,
(void *)__entry->retaddr)
);
@@ -124,13 +124,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__field_struct( struct ftrace_graph_ret, ret )
__field_packed( unsigned long, ret, func )
__field_packed( unsigned long, ret, retval )
- __field_packed( int, ret, depth )
+ __field_packed( unsigned int, ret, depth )
__field_packed( unsigned int, ret, overrun )
__field(unsigned long long, calltime )
__field(unsigned long long, rettime )
),
- F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx",
+ F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u retval: %lx",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth, __entry->retval)
@@ -146,13 +146,13 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
__field_packed( unsigned long, ret, func )
- __field_packed( int, ret, depth )
+ __field_packed( unsigned int, ret, depth )
__field_packed( unsigned int, ret, overrun )
__field(unsigned long long, calltime )
__field(unsigned long long, rettime )
),
- F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
+ F_printk("<-- %ps (%u) (start: %llx end: %llx) over: %u",
(void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth)
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 916555f0de81..a1d402124836 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -9,14 +9,15 @@
* Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com>
*
*/
+#include <linux/cleanup.h>
+#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/mutex.h>
-#include <linux/ftrace.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
-#include "trace_probe_tmpl.h"
#include "trace_probe_kernel.h"
+#include "trace_probe_tmpl.h"
#define EPROBE_EVENT_SYSTEM "eprobes"
@@ -343,10 +344,15 @@ get_event_field(struct fetch_insn *code, void *rec)
val = *(unsigned int *)addr;
break;
default:
- if (field->is_signed)
- val = *(long *)addr;
- else
- val = *(unsigned long *)addr;
+ if (field->size == sizeof(long)) {
+ if (field->is_signed)
+ val = *(long *)addr;
+ else
+ val = *(unsigned long *)addr;
+ break;
+ }
+ /* This is an array, point to the addr itself */
+ val = (unsigned long)addr;
break;
}
return val;
@@ -797,18 +803,20 @@ find_and_get_event(const char *system, const char *event_name)
static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
{
- struct traceprobe_parse_context ctx = {
- .event = ep->event,
- .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT,
- };
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
int ret;
- ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx);
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->event = ep->event;
+ ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT;
+
+ ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx);
/* Handle symbols "@" */
if (!ret)
ret = traceprobe_update_arg(&ep->tp.args[i]);
- traceprobe_finish_parse(&ctx);
return ret;
}
@@ -869,10 +877,10 @@ static int __trace_eprobe_create(int argc, const char *argv[])
const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
const char *sys_event = NULL, *sys_name = NULL;
struct trace_event_call *event_call;
+ char *buf1 __free(kfree) = NULL;
+ char *buf2 __free(kfree) = NULL;
+ char *gbuf __free(kfree) = NULL;
struct trace_eprobe *ep = NULL;
- char buf1[MAX_EVENT_NAME_LEN];
- char buf2[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
int ret = 0, filter_idx = 0;
int i, filter_cnt;
@@ -883,6 +891,9 @@ static int __trace_eprobe_create(int argc, const char *argv[])
event = strchr(&argv[0][1], ':');
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ goto mem_error;
event++;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
@@ -892,6 +903,11 @@ static int __trace_eprobe_create(int argc, const char *argv[])
trace_probe_log_set_index(1);
sys_event = argv[1];
+
+ buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!buf2)
+ goto mem_error;
+
ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0);
if (ret || !sys_event || !sys_name) {
trace_probe_log_err(0, NO_EVENT_INFO);
@@ -899,7 +915,9 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
if (!event) {
- strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
+ buf1 = kstrdup(sys_event, GFP_KERNEL);
+ if (!buf1)
+ goto mem_error;
event = buf1;
}
@@ -972,6 +990,9 @@ static int __trace_eprobe_create(int argc, const char *argv[])
trace_probe_log_clear();
return ret;
+mem_error:
+ ret = -ENOMEM;
+ goto error;
parse_error:
ret = -EINVAL;
error:
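
The eprobe parser above replaces on-stack name buffers with heap buffers tagged __free(kfree), the <linux/cleanup.h> scope-based cleanup idiom: kfree() runs automatically when the variable goes out of scope, on every return path. A self-contained sketch of the idiom (the function and its validation step are made up):

#include <linux/cleanup.h>
#include <linux/slab.h>
#include <linux/string.h>

static int example_dup_and_check(const char *src)
{
	char *buf __free(kfree) = kstrdup(src, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;

	if (!strlen(buf))
		return -EINVAL;		/* buf is kfree()d automatically here... */

	return 0;			/* ...and here */
}
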
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 069e92856bda..d01e5c910ce1 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -400,6 +400,20 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca
return true;
}
+static void handle_dereference_arg(const char *arg_str, u64 string_flags, int len,
+ u64 *dereference_flags, int arg,
+ struct trace_event_call *call)
+{
+ if (string_flags & (1ULL << arg)) {
+ if (process_string(arg_str, len, call))
+ *dereference_flags &= ~(1ULL << arg);
+ } else if (process_pointer(arg_str, len, call))
+ *dereference_flags &= ~(1ULL << arg);
+ else
+ pr_warn("TRACE EVENT ERROR: Bad dereference argument: '%.*s'\n",
+ len, arg_str);
+}
+
/*
* Examine the print fmt of the event looking for unsafe dereference
* pointers using %p* that could be recorded in the trace event and
@@ -563,11 +577,9 @@ static void test_event_printk(struct trace_event_call *call)
}
if (dereference_flags & (1ULL << arg)) {
- if (string_flags & (1ULL << arg)) {
- if (process_string(fmt + start_arg, e - start_arg, call))
- dereference_flags &= ~(1ULL << arg);
- } else if (process_pointer(fmt + start_arg, e - start_arg, call))
- dereference_flags &= ~(1ULL << arg);
+ handle_dereference_arg(fmt + start_arg, string_flags,
+ e - start_arg,
+ &dereference_flags, arg, call);
}
start_arg = i;
@@ -578,11 +590,9 @@ static void test_event_printk(struct trace_event_call *call)
}
if (dereference_flags & (1ULL << arg)) {
- if (string_flags & (1ULL << arg)) {
- if (process_string(fmt + start_arg, i - start_arg, call))
- dereference_flags &= ~(1ULL << arg);
- } else if (process_pointer(fmt + start_arg, i - start_arg, call))
- dereference_flags &= ~(1ULL << arg);
+ handle_dereference_arg(fmt + start_arg, string_flags,
+ i - start_arg,
+ &dereference_flags, arg, call);
}
/*
@@ -622,7 +632,6 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
{
struct trace_array *tr = trace_file->tr;
- struct trace_array_cpu *data;
struct trace_pid_list *no_pid_list;
struct trace_pid_list *pid_list;
@@ -632,9 +641,11 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
if (!pid_list && !no_pid_list)
return false;
- data = this_cpu_ptr(tr->array_buffer.data);
-
- return data->ignore_pid;
+ /*
+ * This is recorded at every sched_switch for this task.
+	 * Thus, even if the task migrates, the ignore value will be the same.
+ */
+ return this_cpu_read(tr->array_buffer.data->ignore_pid) != 0;
}
EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid);
@@ -3125,7 +3136,10 @@ __register_event(struct trace_event_call *call, struct module *mod)
if (ret < 0)
return ret;
+ down_write(&trace_event_sem);
list_add(&call->list, &ftrace_events);
+ up_write(&trace_event_sem);
+
if (call->flags & TRACE_EVENT_FL_DYNAMIC)
atomic_set(&call->refcnt, 0);
else
@@ -3739,6 +3753,8 @@ __trace_add_event_dirs(struct trace_array *tr)
struct trace_event_call *call;
int ret;
+ lockdep_assert_held(&trace_event_sem);
+
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
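
The trace_event_ignore_this_pid() change above collapses a this_cpu_ptr() plus dereference into a single this_cpu_read() of a field reached through a per-CPU pointer. A minimal sketch of that access pattern, with hypothetical names and assuming a successful percpu allocation:

#include <linux/percpu.h>
#include <linux/slab.h>

struct pcpu_state {
	int ignore;
};

static struct pcpu_state __percpu *state;

static int example_init(void)
{
	state = alloc_percpu(struct pcpu_state);
	return state ? 0 : -ENOMEM;
}

static bool example_ignored(void)
{
	/*
	 * Reads this CPU's instance of state->ignore in one step; no
	 * this_cpu_ptr() + dereference pair is needed for a snapshot.
	 */
	return this_cpu_read(state->ignore) != 0;
}

Because the value is re-recorded at every sched_switch, a plain snapshot from whatever CPU the task is currently on is sufficient, which is why no preemption control is needed around the read.
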
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 2048560264bb..e4581e10782b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1250,7 +1250,9 @@ static void append_filter_err(struct trace_array *tr,
static inline struct event_filter *event_filter(struct trace_event_file *file)
{
- return file->filter;
+ return rcu_dereference_protected(file->filter,
+ lockdep_is_held(&event_mutex));
+
}
/* caller must hold event_mutex */
@@ -1320,7 +1322,7 @@ void free_event_filter(struct event_filter *filter)
static inline void __remove_filter(struct trace_event_file *file)
{
filter_disable(file);
- remove_filter_string(file->filter);
+ remove_filter_string(event_filter(file));
}
static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
@@ -1335,22 +1337,137 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
}
}
+struct filter_list {
+ struct list_head list;
+ struct event_filter *filter;
+};
+
+struct filter_head {
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+
+static void free_filter_list(struct rcu_head *rhp)
+{
+ struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
+ struct filter_list *filter_item, *tmp;
+
+ list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) {
+ __free_filter(filter_item->filter);
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ kfree(filter_list);
+}
+
+static void free_filter_list_tasks(struct rcu_head *rhp)
+{
+ call_rcu(rhp, free_filter_list);
+}
+
+/*
+ * tracepoint_synchronize_unregister() is a double RCU call:
+ * it calls synchronize_rcu_tasks_trace() followed by synchronize_rcu().
+ * Instead of waiting for them, simply queue the frees via the call_rcu*()
+ * variants.
+ */
+static void delay_free_filter(struct filter_head *head)
+{
+ call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks);
+}
+
+static void try_delay_free_filter(struct event_filter *filter)
+{
+ struct filter_head *head;
+ struct filter_list *item;
+
+ head = kmalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ goto free_now;
+
+ INIT_LIST_HEAD(&head->list);
+
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item) {
+ kfree(head);
+ goto free_now;
+ }
+
+ item->filter = filter;
+ list_add_tail(&item->list, &head->list);
+ delay_free_filter(head);
+ return;
+
+ free_now:
+ /* Make sure the filter is not being used */
+ tracepoint_synchronize_unregister();
+ __free_filter(filter);
+}
+
static inline void __free_subsystem_filter(struct trace_event_file *file)
{
- __free_filter(file->filter);
+ __free_filter(event_filter(file));
file->filter = NULL;
}
+static inline void event_set_filter(struct trace_event_file *file,
+ struct event_filter *filter)
+{
+ rcu_assign_pointer(file->filter, filter);
+}
+
+static inline void event_clear_filter(struct trace_event_file *file)
+{
+ RCU_INIT_POINTER(file->filter, NULL);
+}
+
static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
- struct trace_array *tr)
+ struct trace_array *tr,
+ struct event_filter *filter)
{
struct trace_event_file *file;
+ struct filter_head *head;
+ struct filter_list *item;
+
+ head = kmalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ goto free_now;
+
+ INIT_LIST_HEAD(&head->list);
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
continue;
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item)
+ goto free_now;
+ item->filter = event_filter(file);
+ list_add_tail(&item->list, &head->list);
+ event_clear_filter(file);
+ }
+
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item)
+ goto free_now;
+
+ item->filter = filter;
+ list_add_tail(&item->list, &head->list);
+
+ delay_free_filter(head);
+ return;
+ free_now:
+ tracepoint_synchronize_unregister();
+
+ if (head)
+ free_filter_list(&head->rcu);
+
+ list_for_each_entry(file, &tr->events, list) {
+ if (file->system != dir || !file->filter)
+ continue;
__free_subsystem_filter(file);
}
+ __free_filter(filter);
}
int filter_assign_type(const char *type)
@@ -2120,22 +2237,6 @@ static inline void event_set_filtered_flag(struct trace_event_file *file)
trace_buffered_event_enable();
}
-static inline void event_set_filter(struct trace_event_file *file,
- struct event_filter *filter)
-{
- rcu_assign_pointer(file->filter, filter);
-}
-
-static inline void event_clear_filter(struct trace_event_file *file)
-{
- RCU_INIT_POINTER(file->filter, NULL);
-}
-
-struct filter_list {
- struct list_head list;
- struct event_filter *filter;
-};
-
static int process_system_preds(struct trace_subsystem_dir *dir,
struct trace_array *tr,
struct filter_parse_error *pe,
@@ -2144,11 +2245,16 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
struct trace_event_file *file;
struct filter_list *filter_item;
struct event_filter *filter = NULL;
- struct filter_list *tmp;
- LIST_HEAD(filter_list);
+ struct filter_head *filter_list;
bool fail = true;
int err;
+ filter_list = kmalloc(sizeof(*filter_list), GFP_KERNEL);
+ if (!filter_list)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&filter_list->list);
+
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
@@ -2175,7 +2281,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
if (!filter_item)
goto fail_mem;
- list_add_tail(&filter_item->list, &filter_list);
+ list_add_tail(&filter_item->list, &filter_list->list);
/*
* Regardless of if this returned an error, we still
* replace the filter for the call.
@@ -2195,31 +2301,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
* Do a synchronize_rcu() and to ensure all calls are
* done with them before we free them.
*/
- tracepoint_synchronize_unregister();
- list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
- __free_filter(filter_item->filter);
- list_del(&filter_item->list);
- kfree(filter_item);
- }
+ delay_free_filter(filter_list);
return 0;
fail:
/* No call succeeded */
- list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
- list_del(&filter_item->list);
- kfree(filter_item);
- }
+ free_filter_list(&filter_list->rcu);
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
__free_filter(filter);
+
/* If any call succeeded, we still need to sync */
if (!fail)
- tracepoint_synchronize_unregister();
- list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
- __free_filter(filter_item->filter);
- list_del(&filter_item->list);
- kfree(filter_item);
- }
+ delay_free_filter(filter_list);
+ else
+ free_filter_list(&filter_list->rcu);
+
return -ENOMEM;
}
@@ -2361,9 +2458,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
event_clear_filter(file);
- /* Make sure the filter is not being used */
- tracepoint_synchronize_unregister();
- __free_filter(filter);
+ try_delay_free_filter(filter);
return 0;
}
@@ -2387,11 +2482,8 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
event_set_filter(file, filter);
- if (tmp) {
- /* Make sure the call is done with the filter */
- tracepoint_synchronize_unregister();
- __free_filter(tmp);
- }
+ if (tmp)
+ try_delay_free_filter(tmp);
}
return err;
@@ -2417,9 +2509,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
filter = system->filter;
system->filter = NULL;
/* Ensure all filters are no longer used */
- tracepoint_synchronize_unregister();
- filter_free_subsystem_filters(dir, tr);
- __free_filter(filter);
+ filter_free_subsystem_filters(dir, tr, filter);
return 0;
}
@@ -2810,6 +2900,10 @@ static __init int ftrace_test_event_filter(void)
if (i == DATA_CNT)
printk(KERN_CONT "OK\n");
+ /* Need to call ftrace_test_filter to prevent a warning */
+ if (!trace_ftrace_test_filter_enabled())
+ trace_ftrace_test_filter(1, 2, 3, 4, 5, 6, 7, 8);
+
return 0;
}
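
The filter teardown above replaces the sleeping tracepoint_synchronize_unregister() with two chained RCU callbacks: call_rcu_tasks_trace() fires first and its callback re-queues the same rcu_head with call_rcu(), so the memory is freed only after both grace periods without blocking the writer. A stripped-down sketch of that chaining, with hypothetical names:

#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/slab.h>

struct deferred_obj {
	struct rcu_head rcu;
	/* payload that readers may still be touching */
};

static void deferred_free_stage2(struct rcu_head *rhp)
{
	/* Both grace periods have elapsed; safe to free. */
	kfree(container_of(rhp, struct deferred_obj, rcu));
}

static void deferred_free_stage1(struct rcu_head *rhp)
{
	/* Tasks-trace grace period done; now wait for a normal one. */
	call_rcu(rhp, deferred_free_stage2);
}

static void deferred_free(struct deferred_obj *obj)
{
	call_rcu_tasks_trace(&obj->rcu, deferred_free_stage1);
}

Re-queuing the rcu_head from inside the first callback is safe because RCU has already handed the head back to the callback; this mirrors how free_filter_list_tasks() chains into free_filter_list() above.
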
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1260c23cfa5f..1d536219b624 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -114,6 +114,7 @@ enum hist_field_fn {
HIST_FIELD_FN_BUCKET,
HIST_FIELD_FN_TIMESTAMP,
HIST_FIELD_FN_CPU,
+ HIST_FIELD_FN_COMM,
HIST_FIELD_FN_STRING,
HIST_FIELD_FN_DYNSTRING,
HIST_FIELD_FN_RELDYNSTRING,
@@ -506,6 +507,7 @@ enum hist_field_flags {
HIST_FIELD_FL_CONST = 1 << 18,
HIST_FIELD_FL_PERCENT = 1 << 19,
HIST_FIELD_FL_GRAPH = 1 << 20,
+ HIST_FIELD_FL_COMM = 1 << 21,
};
struct var_defs {
@@ -885,6 +887,15 @@ static u64 hist_field_cpu(struct hist_field *hist_field,
return cpu;
}
+static u64 hist_field_comm(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ return (u64)(unsigned long)current->comm;
+}
+
/**
* check_field_for_var_ref - Check if a VAR_REF field references a variable
* @hist_field: The VAR_REF field to check
@@ -1338,6 +1349,8 @@ static const char *hist_field_name(struct hist_field *field,
field_name = hist_field_name(field->operands[0], ++level);
else if (field->flags & HIST_FIELD_FL_CPU)
field_name = "common_cpu";
+ else if (field->flags & HIST_FIELD_FL_COMM)
+ field_name = "common_comm";
else if (field->flags & HIST_FIELD_FL_EXPR ||
field->flags & HIST_FIELD_FL_VAR_REF) {
if (field->system) {
@@ -2015,6 +2028,13 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
goto out;
}
+ if (flags & HIST_FIELD_FL_COMM) {
+ hist_field->fn_num = HIST_FIELD_FN_COMM;
+ hist_field->size = MAX_FILTER_STR_VAL;
+ hist_field->type = "char[]";
+ goto out;
+ }
+
if (WARN_ON_ONCE(!field))
goto out;
@@ -2359,9 +2379,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
hist_data->attrs->ts_in_usecs = true;
} else if (strcmp(field_name, "common_stacktrace") == 0) {
*flags |= HIST_FIELD_FL_STACKTRACE;
- } else if (strcmp(field_name, "common_cpu") == 0)
+ } else if (strcmp(field_name, "common_cpu") == 0) {
*flags |= HIST_FIELD_FL_CPU;
- else if (strcmp(field_name, "hitcount") == 0)
+ } else if (strcmp(field_name, "common_comm") == 0) {
+ *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING;
+ } else if (strcmp(field_name, "hitcount") == 0)
*flags |= HIST_FIELD_FL_HITCOUNT;
else {
field = trace_find_event_field(file->event_call, field_name);
@@ -2377,6 +2399,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
*flags |= HIST_FIELD_FL_CPU;
} else if (field && field->filter_type == FILTER_STACKTRACE) {
*flags |= HIST_FIELD_FL_STACKTRACE;
+ } else if (field && field->filter_type == FILTER_COMM) {
+ *flags |= HIST_FIELD_FL_COMM | HIST_FIELD_FL_STRING;
} else {
hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
errpos(field_name));
@@ -4327,6 +4351,8 @@ static u64 hist_fn_call(struct hist_field *hist_field,
return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_CPU:
return hist_field_cpu(hist_field, elt, buffer, rbe, event);
+ case HIST_FIELD_FN_COMM:
+ return hist_field_comm(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_STRING:
return hist_field_string(hist_field, elt, buffer, rbe, event);
case HIST_FIELD_FN_DYNSTRING:
@@ -5212,22 +5238,25 @@ static inline void add_to_key(char *compound_key, void *key,
size_t size = key_field->size;
if (key_field->flags & HIST_FIELD_FL_STRING) {
- struct ftrace_event_field *field;
- field = key_field->field;
- if (field->filter_type == FILTER_DYN_STRING ||
- field->filter_type == FILTER_RDYN_STRING)
- size = *(u32 *)(rec + field->offset) >> 16;
- else if (field->filter_type == FILTER_STATIC_STRING)
- size = field->size;
+ if (key_field->flags & HIST_FIELD_FL_COMM) {
+ size = strlen((char *)key);
+ } else {
+ struct ftrace_event_field *field;
+
+ field = key_field->field;
+ if (field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_RDYN_STRING)
+ size = *(u32 *)(rec + field->offset) >> 16;
+ else if (field->filter_type == FILTER_STATIC_STRING)
+ size = field->size;
+ }
/* ensure NULL-termination */
if (size > key_field->size - 1)
size = key_field->size - 1;
-
- strncpy(compound_key + key_field->offset, (char *)key, size);
- } else
- memcpy(compound_key + key_field->offset, key, size);
+ }
+ memcpy(compound_key + key_field->offset, key, size);
}
static void
@@ -5246,17 +5275,94 @@ hist_trigger_actions(struct hist_trigger_data *hist_data,
}
}
+/*
+ * The hist_pad structure is used to save information to create
+ * a histogram from the histogram trigger. It's too big to store
+ * on the stack, so when the histogram trigger is initialized
+ * a percpu array of 4 hist_pad structures is allocated.
+ * This will cover every context from normal, softirq, irq and NMI
+ * in the very unlikely event that a trigger happens at each of
+ * these contexts and interrupts a currently active trigger.
+ */
+struct hist_pad {
+ unsigned long entries[HIST_STACKTRACE_DEPTH];
+ u64 var_ref_vals[TRACING_MAP_VARS_MAX];
+ char compound_key[HIST_KEY_SIZE_MAX];
+};
+
+static struct hist_pad __percpu *hist_pads;
+static DEFINE_PER_CPU(int, hist_pad_cnt);
+static refcount_t hist_pad_ref;
+
+/* One hist_pad for every context (normal, softirq, irq, NMI) */
+#define MAX_HIST_CNT 4
+
+static int alloc_hist_pad(void)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (refcount_read(&hist_pad_ref)) {
+ refcount_inc(&hist_pad_ref);
+ return 0;
+ }
+
+ hist_pads = __alloc_percpu(sizeof(struct hist_pad) * MAX_HIST_CNT,
+ __alignof__(struct hist_pad));
+ if (!hist_pads)
+ return -ENOMEM;
+
+ refcount_set(&hist_pad_ref, 1);
+ return 0;
+}
+
+static void free_hist_pad(void)
+{
+ lockdep_assert_held(&event_mutex);
+
+ if (!refcount_dec_and_test(&hist_pad_ref))
+ return;
+
+ free_percpu(hist_pads);
+ hist_pads = NULL;
+}
+
+static struct hist_pad *get_hist_pad(void)
+{
+ struct hist_pad *hist_pad;
+ int cnt;
+
+ if (WARN_ON_ONCE(!hist_pads))
+ return NULL;
+
+ preempt_disable();
+
+ hist_pad = per_cpu_ptr(hist_pads, smp_processor_id());
+
+ if (this_cpu_read(hist_pad_cnt) == MAX_HIST_CNT) {
+ preempt_enable();
+ return NULL;
+ }
+
+ cnt = this_cpu_inc_return(hist_pad_cnt) - 1;
+
+ return &hist_pad[cnt];
+}
+
+static void put_hist_pad(void)
+{
+ this_cpu_dec(hist_pad_cnt);
+ preempt_enable();
+}
+
static void event_hist_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *rbe)
{
struct hist_trigger_data *hist_data = data->private_data;
bool use_compound_key = (hist_data->n_keys > 1);
- unsigned long entries[HIST_STACKTRACE_DEPTH];
- u64 var_ref_vals[TRACING_MAP_VARS_MAX];
- char compound_key[HIST_KEY_SIZE_MAX];
struct tracing_map_elt *elt = NULL;
struct hist_field *key_field;
+ struct hist_pad *hist_pad;
u64 field_contents;
void *key = NULL;
unsigned int i;
@@ -5264,12 +5370,18 @@ static void event_hist_trigger(struct event_trigger_data *data,
if (unlikely(!rbe))
return;
- memset(compound_key, 0, hist_data->key_size);
+ hist_pad = get_hist_pad();
+ if (!hist_pad)
+ return;
+
+ memset(hist_pad->compound_key, 0, hist_data->key_size);
for_each_hist_key_field(i, hist_data) {
key_field = hist_data->fields[i];
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+ unsigned long *entries = hist_pad->entries;
+
memset(entries, 0, HIST_STACKTRACE_SIZE);
if (key_field->field) {
unsigned long *stack, n_entries;
@@ -5293,26 +5405,31 @@ static void event_hist_trigger(struct event_trigger_data *data,
}
if (use_compound_key)
- add_to_key(compound_key, key, key_field, rec);
+ add_to_key(hist_pad->compound_key, key, key_field, rec);
}
if (use_compound_key)
- key = compound_key;
+ key = hist_pad->compound_key;
if (hist_data->n_var_refs &&
- !resolve_var_refs(hist_data, key, var_ref_vals, false))
- return;
+ !resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, false))
+ goto out;
elt = tracing_map_insert(hist_data->map, key);
if (!elt)
- return;
+ goto out;
- hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals);
+ hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, hist_pad->var_ref_vals);
- if (resolve_var_refs(hist_data, key, var_ref_vals, true))
- hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
+ if (resolve_var_refs(hist_data, key, hist_pad->var_ref_vals, true)) {
+ hist_trigger_actions(hist_data, elt, buffer, rec, rbe,
+ key, hist_pad->var_ref_vals);
+ }
hist_poll_wakeup();
+
+ out:
+ put_hist_pad();
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -6011,6 +6128,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
if (hist_field->flags & HIST_FIELD_FL_CPU)
seq_puts(m, "common_cpu");
+ if (hist_field->flags & HIST_FIELD_FL_COMM)
+ seq_puts(m, "common_comm");
else if (hist_field->flags & HIST_FIELD_FL_CONST)
seq_printf(m, "%llu", hist_field->constant);
else if (field_name) {
@@ -6157,6 +6276,9 @@ static int event_hist_trigger_init(struct event_trigger_data *data)
{
struct hist_trigger_data *hist_data = data->private_data;
+ if (alloc_hist_pad() < 0)
+ return -ENOMEM;
+
if (!data->ref && hist_data->attrs->name)
save_named_trigger(hist_data->attrs->name, data);
@@ -6201,6 +6323,7 @@ static void event_hist_trigger_free(struct event_trigger_data *data)
destroy_hist_data(hist_data);
}
+ free_hist_pad();
}
static const struct event_trigger_ops event_hist_trigger_ops = {
@@ -6216,9 +6339,7 @@ static int event_hist_trigger_named_init(struct event_trigger_data *data)
save_named_trigger(data->named_data->name, data);
- event_hist_trigger_init(data->named_data);
-
- return 0;
+ return event_hist_trigger_init(data->named_data);
}
static void event_hist_trigger_named_free(struct event_trigger_data *data)
@@ -6705,7 +6826,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
return PTR_ERR(hist_data);
}
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, hist_data);
if (!trigger_data) {
ret = -ENOMEM;
goto out_free;
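
The hist_pad changes above move three large per-event arrays off the stack into a per-CPU scratch area with one slot per nesting context (task, softirq, irq, NMI), selected by a per-CPU counter. A condensed sketch of that get/put pattern, with hypothetical names:

#include <linux/percpu.h>
#include <linux/preempt.h>

#define SCRATCH_NEST_MAX	4	/* task, softirq, irq, NMI */

struct scratch {
	char buf[256];
};

static struct scratch __percpu *scratch_pads;	/* SCRATCH_NEST_MAX slots per CPU */
static DEFINE_PER_CPU(int, scratch_nest);

static int scratch_init(void)
{
	scratch_pads = __alloc_percpu(sizeof(struct scratch) * SCRATCH_NEST_MAX,
				      __alignof__(struct scratch));
	return scratch_pads ? 0 : -ENOMEM;
}

static struct scratch *get_scratch(void)
{
	struct scratch *pad;

	preempt_disable();
	pad = per_cpu_ptr(scratch_pads, smp_processor_id());
	if (this_cpu_read(scratch_nest) == SCRATCH_NEST_MAX) {
		preempt_enable();
		return NULL;		/* every nesting level already in use */
	}
	return &pad[this_cpu_inc_return(scratch_nest) - 1];
}

static void put_scratch(void)
{
	this_cpu_dec(scratch_nest);
	preempt_enable();
}

Returning NULL when all slots are busy simply skips the work, which matches the trigger code above treating a failed get_hist_pad() as nothing to record.
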
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 6e87ae2a1a66..cbfc306c0159 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -552,16 +552,14 @@ static int register_trigger(char *glob,
lockdep_assert_held(&event_mutex);
list_for_each_entry(test, &file->triggers, list) {
- if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) {
- ret = -EEXIST;
- goto out;
- }
+ if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type)
+ return -EEXIST;
}
if (data->ops->init) {
ret = data->ops->init(data);
if (ret < 0)
- goto out;
+ return ret;
}
list_add_rcu(&data->list, &file->triggers);
@@ -572,7 +570,6 @@ static int register_trigger(char *glob,
list_del_rcu(&data->list);
update_cond_flag(file);
}
-out:
return ret;
}
@@ -770,7 +767,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
if (!param_and_filter) {
if (param_required)
ret = -EINVAL;
- goto out;
+ return ret;
}
/*
@@ -781,7 +778,7 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
*/
if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) {
*filter = param_and_filter;
- goto out;
+ return ret;
}
/*
@@ -799,12 +796,11 @@ int event_trigger_separate_filter(char *param_and_filter, char **param,
if (!**filter)
*filter = NULL;
}
-out:
return ret;
}
/**
- * event_trigger_alloc - allocate and init event_trigger_data for a trigger
+ * trigger_data_alloc - allocate and init event_trigger_data for a trigger
* @cmd_ops: The event_command operations for the trigger
* @cmd: The cmd string
* @param: The param string
@@ -815,14 +811,14 @@ out:
* trigger_ops to assign to the event_trigger_data. @private_data can
* also be passed in and associated with the event_trigger_data.
*
- * Use event_trigger_free() to free an event_trigger_data object.
+ * Use trigger_data_free() to free an event_trigger_data object.
*
 * Return: The trigger_data object on success, NULL otherwise
*/
-struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops,
- char *cmd,
- char *param,
- void *private_data)
+struct event_trigger_data *trigger_data_alloc(struct event_command *cmd_ops,
+ char *cmd,
+ char *param,
+ void *private_data)
{
struct event_trigger_data *trigger_data;
const struct event_trigger_ops *trigger_ops;
@@ -989,15 +985,14 @@ event_trigger_parse(struct event_command *cmd_ops,
return ret;
ret = -ENOMEM;
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, file);
if (!trigger_data)
- goto out;
+ return ret;
if (remove) {
event_trigger_unregister(cmd_ops, file, glob+1, trigger_data);
- kfree(trigger_data);
- ret = 0;
- goto out;
+ trigger_data_free(trigger_data);
+ return 0;
}
ret = event_trigger_parse_num(param, trigger_data);
@@ -1017,13 +1012,12 @@ event_trigger_parse(struct event_command *cmd_ops,
/* Down the counter of trigger_data or free it if not used anymore */
event_trigger_free(trigger_data);
- out:
return ret;
out_free:
event_trigger_reset_filter(cmd_ops, trigger_data);
- kfree(trigger_data);
- goto out;
+ trigger_data_free(trigger_data);
+ return ret;
}
/**
@@ -1057,10 +1051,10 @@ int set_trigger_filter(char *filter_str,
s = strsep(&filter_str, " \t");
if (!strlen(s) || strcmp(s, "if") != 0)
- goto out;
+ return ret;
if (!filter_str)
- goto out;
+ return ret;
/* The filter is for the 'trigger' event, not the triggered event */
ret = create_event_filter(file->tr, file->event_call,
@@ -1104,7 +1098,6 @@ int set_trigger_filter(char *filter_str,
ret = -ENOMEM;
}
}
- out:
return ret;
}
@@ -1772,7 +1765,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
ret = -EINVAL;
event_enable_file = find_event_file(tr, system, event);
if (!event_enable_file)
- goto out;
+ return ret;
#ifdef CONFIG_HIST_TRIGGERS
hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) ||
@@ -1787,16 +1780,16 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL);
if (!enable_data)
- goto out;
+ return ret;
enable_data->hist = hist;
enable_data->enable = enable;
enable_data->file = event_enable_file;
- trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data);
+ trigger_data = trigger_data_alloc(cmd_ops, cmd, param, enable_data);
if (!trigger_data) {
kfree(enable_data);
- goto out;
+ return ret;
}
if (remove) {
@@ -1804,7 +1797,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
kfree(trigger_data);
kfree(enable_data);
ret = 0;
- goto out;
+ return ret;
}
/* Up the trigger_data count to make sure nothing frees it on failure */
@@ -1834,7 +1827,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
goto out_disable;
event_trigger_free(trigger_data);
- out:
return ret;
out_disable:
trace_event_enable_disable(event_enable_file, 0, 1);
@@ -1845,7 +1837,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops,
event_trigger_free(trigger_data);
kfree(enable_data);
- goto out;
+ return ret;
}
int event_enable_register_trigger(char *glob,
@@ -1865,15 +1857,14 @@ int event_enable_register_trigger(char *glob,
(test->cmd_ops->trigger_type ==
data->cmd_ops->trigger_type) &&
(test_enable_data->file == enable_data->file)) {
- ret = -EEXIST;
- goto out;
+ return -EEXIST;
}
}
if (data->ops->init) {
ret = data->ops->init(data);
if (ret < 0)
- goto out;
+ return ret;
}
list_add_rcu(&data->list, &file->triggers);
@@ -1884,7 +1875,6 @@ int event_enable_register_trigger(char *glob,
list_del_rcu(&data->list);
update_cond_flag(file);
}
-out:
return ret;
}
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index b40fa59159ac..b36ade43d4b3 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -4,15 +4,18 @@
* Copyright (C) 2022 Google LLC.
*/
#define pr_fmt(fmt) "trace_fprobe: " fmt
-#include <asm/ptrace.h>
#include <linux/fprobe.h>
+#include <linux/list.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/security.h>
#include <linux/tracepoint.h>
#include <linux/uaccess.h>
+#include <asm/ptrace.h>
+
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_kernel.h"
@@ -21,7 +24,6 @@
#define FPROBE_EVENT_SYSTEM "fprobes"
#define TRACEPOINT_EVENT_SYSTEM "tracepoints"
#define RETHOOK_MAXACTIVE_MAX 4096
-#define TRACEPOINT_STUB ERR_PTR(-ENOENT)
static int trace_fprobe_create(const char *raw_command);
static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
@@ -38,15 +40,156 @@ static struct dyn_event_operations trace_fprobe_ops = {
.match = trace_fprobe_match,
};
+/* List of tracepoint_user */
+static LIST_HEAD(tracepoint_user_list);
+static DEFINE_MUTEX(tracepoint_user_mutex);
+
+/* While a tracepoint_user is alive, @tpoint can be NULL and @refcount != 0. */
+struct tracepoint_user {
+ struct list_head list;
+ const char *name;
+ struct tracepoint *tpoint;
+ unsigned int refcount;
+};
+
+/* NOTE: you must lock tracepoint_user_mutex. */
+#define for_each_tracepoint_user(tuser) \
+ list_for_each_entry(tuser, &tracepoint_user_list, list)
+
+static int tracepoint_user_register(struct tracepoint_user *tuser)
+{
+ struct tracepoint *tpoint = tuser->tpoint;
+
+ if (!tpoint)
+ return 0;
+
+ return tracepoint_probe_register_prio_may_exist(tpoint,
+ tpoint->probestub, NULL, 0);
+}
+
+static void tracepoint_user_unregister(struct tracepoint_user *tuser)
+{
+ if (!tuser->tpoint)
+ return;
+
+ WARN_ON_ONCE(tracepoint_probe_unregister(tuser->tpoint, tuser->tpoint->probestub, NULL));
+ tuser->tpoint = NULL;
+}
+
+static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser)
+{
+ if (!tuser->tpoint)
+ return 0UL;
+
+ return (unsigned long)tuser->tpoint->probestub;
+}
+
+static void __tracepoint_user_free(struct tracepoint_user *tuser)
+{
+ if (!tuser)
+ return;
+ kfree(tuser->name);
+ kfree(tuser);
+}
+
+DEFINE_FREE(tuser_free, struct tracepoint_user *, __tracepoint_user_free(_T))
+
+static struct tracepoint_user *__tracepoint_user_init(const char *name, struct tracepoint *tpoint)
+{
+ struct tracepoint_user *tuser __free(tuser_free) = NULL;
+ int ret;
+
+ tuser = kzalloc(sizeof(*tuser), GFP_KERNEL);
+ if (!tuser)
+ return NULL;
+ tuser->name = kstrdup(name, GFP_KERNEL);
+ if (!tuser->name)
+ return NULL;
+
+ if (tpoint) {
+ ret = tracepoint_user_register(tuser);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ tuser->tpoint = tpoint;
+ tuser->refcount = 1;
+ INIT_LIST_HEAD(&tuser->list);
+ list_add(&tuser->list, &tracepoint_user_list);
+
+ return_ptr(tuser);
+}
+
+static struct tracepoint *find_tracepoint(const char *tp_name,
+ struct module **tp_mod);
+
+/*
+ * Get the tracepoint_user if it exists, or allocate a new one and register it.
+ * If the tracepoint is in a module, get the module's refcount too.
+ * This returns an error pointer, NULL (not loaded yet), or a tracepoint_user.
+ */
+static struct tracepoint_user *tracepoint_user_find_get(const char *name, struct module **pmod)
+{
+ struct module *mod __free(module_put) = NULL;
+ struct tracepoint_user *tuser;
+ struct tracepoint *tpoint;
+
+ if (!name || !pmod)
+ return ERR_PTR(-EINVAL);
+
+ /* Get and lock the module which has tracepoint. */
+ tpoint = find_tracepoint(name, &mod);
+
+ guard(mutex)(&tracepoint_user_mutex);
+ /* Search existing tracepoint_user */
+ for_each_tracepoint_user(tuser) {
+ if (!strcmp(tuser->name, name)) {
+ tuser->refcount++;
+ *pmod = no_free_ptr(mod);
+ return tuser;
+ }
+ }
+
+ /* The corresponding tracepoint_user is not found. */
+ tuser = __tracepoint_user_init(name, tpoint);
+ if (!IS_ERR_OR_NULL(tuser))
+ *pmod = no_free_ptr(mod);
+
+ return tuser;
+}
+
+static void tracepoint_user_put(struct tracepoint_user *tuser)
+{
+ scoped_guard(mutex, &tracepoint_user_mutex) {
+ if (--tuser->refcount > 0)
+ return;
+
+ list_del(&tuser->list);
+ tracepoint_user_unregister(tuser);
+ }
+
+ __tracepoint_user_free(tuser);
+}
+
+DEFINE_FREE(tuser_put, struct tracepoint_user *,
+ if (!IS_ERR_OR_NULL(_T))
+ tracepoint_user_put(_T))
+
/*
* Fprobe event core functions
*/
+
+/*
+ * @tprobe is true for a tracepoint probe.
+ * @tuser can be NULL if the trace_fprobe is disabled or the tracepoint's
+ * module is not loaded. If @tuser != NULL, this trace_fprobe is enabled.
+ */
struct trace_fprobe {
struct dyn_event devent;
struct fprobe fp;
const char *symbol;
- struct tracepoint *tpoint;
- struct module *mod;
+ bool tprobe;
+ struct tracepoint_user *tuser;
struct trace_probe tp;
};
@@ -76,7 +219,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf)
static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf)
{
- return tf->tpoint != NULL;
+ return tf->tprobe;
}
static const char *trace_fprobe_symbol(struct trace_fprobe *tf)
@@ -411,6 +554,8 @@ static void free_trace_fprobe(struct trace_fprobe *tf)
{
if (tf) {
trace_probe_cleanup(&tf->tp);
+ if (tf->tuser)
+ tracepoint_user_put(tf->tuser);
kfree(tf->symbol);
kfree(tf);
}
@@ -425,9 +570,8 @@ DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) f
static struct trace_fprobe *alloc_trace_fprobe(const char *group,
const char *event,
const char *symbol,
- struct tracepoint *tpoint,
- struct module *mod,
- int nargs, bool is_return)
+ int nargs, bool is_return,
+ bool is_tracepoint)
{
struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
int ret = -ENOMEM;
@@ -445,8 +589,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
else
tf->fp.entry_handler = fentry_dispatcher;
- tf->tpoint = tpoint;
- tf->mod = mod;
+ tf->tprobe = is_tracepoint;
ret = trace_probe_init(&tf->tp, event, group, false, nargs);
if (ret < 0)
@@ -469,98 +612,6 @@ static struct trace_fprobe *find_trace_fprobe(const char *event,
return NULL;
}
-static inline int __enable_trace_fprobe(struct trace_fprobe *tf)
-{
- if (trace_fprobe_is_registered(tf))
- enable_fprobe(&tf->fp);
-
- return 0;
-}
-
-static void __disable_trace_fprobe(struct trace_probe *tp)
-{
- struct trace_fprobe *tf;
-
- list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
- if (!trace_fprobe_is_registered(tf))
- continue;
- disable_fprobe(&tf->fp);
- }
-}
-
-/*
- * Enable trace_probe
- * if the file is NULL, enable "perf" handler, or enable "trace" handler.
- */
-static int enable_trace_fprobe(struct trace_event_call *call,
- struct trace_event_file *file)
-{
- struct trace_probe *tp;
- struct trace_fprobe *tf;
- bool enabled;
- int ret = 0;
-
- tp = trace_probe_primary_from_call(call);
- if (WARN_ON_ONCE(!tp))
- return -ENODEV;
- enabled = trace_probe_is_enabled(tp);
-
- /* This also changes "enabled" state */
- if (file) {
- ret = trace_probe_add_file(tp, file);
- if (ret)
- return ret;
- } else
- trace_probe_set_flag(tp, TP_FLAG_PROFILE);
-
- if (!enabled) {
- list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
- /* TODO: check the fprobe is gone */
- __enable_trace_fprobe(tf);
- }
- }
-
- return 0;
-}
-
-/*
- * Disable trace_probe
- * if the file is NULL, disable "perf" handler, or disable "trace" handler.
- */
-static int disable_trace_fprobe(struct trace_event_call *call,
- struct trace_event_file *file)
-{
- struct trace_probe *tp;
-
- tp = trace_probe_primary_from_call(call);
- if (WARN_ON_ONCE(!tp))
- return -ENODEV;
-
- if (file) {
- if (!trace_probe_get_file_link(tp, file))
- return -ENOENT;
- if (!trace_probe_has_single_file(tp))
- goto out;
- trace_probe_clear_flag(tp, TP_FLAG_TRACE);
- } else
- trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
-
- if (!trace_probe_is_enabled(tp))
- __disable_trace_fprobe(tp);
-
- out:
- if (file)
- /*
- * Synchronization is done in below function. For perf event,
- * file == NULL and perf_trace_event_unreg() calls
- * tracepoint_synchronize_unregister() to ensure synchronize
- * event. We don't need to care about it.
- */
- trace_probe_remove_file(tp, file);
-
- return 0;
-}
-
/* Event entry printers */
static enum print_line_t
print_fentry_event(struct trace_iterator *iter, int flags,
@@ -712,20 +763,52 @@ static int unregister_fprobe_event(struct trace_fprobe *tf)
static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf)
{
- struct tracepoint *tpoint = tf->tpoint;
- unsigned long ip = (unsigned long)tpoint->probestub;
+ struct tracepoint_user *tuser __free(tuser_put) = NULL;
+ struct module *mod __free(module_put) = NULL;
+ unsigned long ip;
int ret;
+ if (WARN_ON_ONCE(tf->tuser))
+ return -EINVAL;
+
+ /* If the tracepoint is in a module, it must be locked in this function. */
+ tuser = tracepoint_user_find_get(tf->symbol, &mod);
+ /* This tracepoint is not loaded yet */
+ if (IS_ERR(tuser))
+ return PTR_ERR(tuser);
+ if (!tuser)
+ return -ENOMEM;
+
+ /* Register fprobe only if the tracepoint is loaded. */
+ if (tuser->tpoint) {
+ ip = tracepoint_user_ip(tuser);
+ if (WARN_ON_ONCE(!ip))
+ return -ENOENT;
+
+ ret = register_fprobe_ips(&tf->fp, &ip, 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ tf->tuser = no_free_ptr(tuser);
+ return 0;
+}
+
+/* Returns an error if the target function is not available, or 0 */
+static int trace_fprobe_verify_target(struct trace_fprobe *tf)
+{
+ int ret;
+
+ /* Tracepoint should have a stub function. */
+ if (trace_fprobe_is_tracepoint(tf))
+ return 0;
+
/*
- * Here, we do 2 steps to enable fprobe on a tracepoint.
- * At first, put __probestub_##TP function on the tracepoint
- * and put a fprobe on the stub function.
+ * Note: since we don't lock the module, even if this succeeded,
+ * register_fprobe() later can fail.
*/
- ret = tracepoint_probe_register_prio_may_exist(tpoint,
- tpoint->probestub, NULL, 0);
- if (ret < 0)
- return ret;
- return register_fprobe_ips(&tf->fp, &ip, 1);
+ ret = fprobe_count_ips_from_filter(tf->symbol, NULL);
+ return (ret < 0) ? ret : 0;
}
/* Internal register function - just handle fprobe and flags */
@@ -747,20 +830,10 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
return ret;
}
- /* Set/clear disabled flag according to tp->flag */
- if (trace_probe_is_enabled(&tf->tp))
- tf->fp.flags &= ~FPROBE_FL_DISABLED;
- else
- tf->fp.flags |= FPROBE_FL_DISABLED;
-
- if (trace_fprobe_is_tracepoint(tf)) {
-
- /* This tracepoint is not loaded yet */
- if (tf->tpoint == TRACEPOINT_STUB)
- return 0;
+ tf->fp.flags &= ~FPROBE_FL_DISABLED;
+ if (trace_fprobe_is_tracepoint(tf))
return __regsiter_tracepoint_fprobe(tf);
- }
/* TODO: handle filter, nofilter or symbol list */
return register_fprobe(&tf->fp, tf->symbol, NULL);
@@ -769,15 +842,11 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
/* Internal unregister function - just handle fprobe and flags */
static void __unregister_trace_fprobe(struct trace_fprobe *tf)
{
- if (trace_fprobe_is_registered(tf)) {
+ if (trace_fprobe_is_registered(tf))
unregister_fprobe(&tf->fp);
- memset(&tf->fp, 0, sizeof(tf->fp));
- if (trace_fprobe_is_tracepoint(tf)) {
- tracepoint_probe_unregister(tf->tpoint,
- tf->tpoint->probestub, NULL);
- tf->tpoint = NULL;
- tf->mod = NULL;
- }
+ if (tf->tuser) {
+ tracepoint_user_put(tf->tuser);
+ tf->tuser = NULL;
}
}
@@ -837,7 +906,7 @@ static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig,
return false;
}
-static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
+static int append_trace_fprobe_event(struct trace_fprobe *tf, struct trace_fprobe *to)
{
int ret;
@@ -865,7 +934,7 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
if (ret)
return ret;
- ret = __register_trace_fprobe(tf);
+ ret = trace_fprobe_verify_target(tf);
if (ret)
trace_probe_unlink(&tf->tp);
else
@@ -874,8 +943,8 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
return ret;
}
-/* Register a trace_probe and probe_event */
-static int register_trace_fprobe(struct trace_fprobe *tf)
+/* Register a trace_probe and probe_event, and check the fprobe is available. */
+static int register_trace_fprobe_event(struct trace_fprobe *tf)
{
struct trace_fprobe *old_tf;
int ret;
@@ -885,7 +954,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
old_tf = find_trace_fprobe(trace_probe_name(&tf->tp),
trace_probe_group_name(&tf->tp));
if (old_tf)
- return append_trace_fprobe(tf, old_tf);
+ return append_trace_fprobe_event(tf, old_tf);
/* Register new event */
ret = register_fprobe_event(tf);
@@ -898,8 +967,8 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
return ret;
}
- /* Register fprobe */
- ret = __register_trace_fprobe(tf);
+ /* Verify fprobe is sane. */
+ ret = trace_fprobe_verify_target(tf);
if (ret < 0)
unregister_fprobe_event(tf);
else
@@ -963,15 +1032,6 @@ static struct tracepoint *find_tracepoint(const char *tp_name,
}
#ifdef CONFIG_MODULES
-static void reenable_trace_fprobe(struct trace_fprobe *tf)
-{
- struct trace_probe *tp = &tf->tp;
-
- list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
- __enable_trace_fprobe(tf);
- }
-}
-
/*
* Find a tracepoint from specified module. In this case, this does not get the
* module's refcount. The caller must ensure the module is not freed.
@@ -988,36 +1048,94 @@ static struct tracepoint *find_tracepoint_in_module(struct module *mod,
return data.tpoint;
}
+/* These are CONFIG_MODULES=y specific functions. */
+static bool tracepoint_user_within_module(struct tracepoint_user *tuser,
+ struct module *mod)
+{
+ return within_module(tracepoint_user_ip(tuser), mod);
+}
+
+static int tracepoint_user_register_again(struct tracepoint_user *tuser,
+ struct tracepoint *tpoint)
+{
+ tuser->tpoint = tpoint;
+ return tracepoint_user_register(tuser);
+}
+
+static void tracepoint_user_unregister_clear(struct tracepoint_user *tuser)
+{
+ tracepoint_user_unregister(tuser);
+ tuser->tpoint = NULL;
+}
+
+/* module callback for tracepoint_user */
static int __tracepoint_probe_module_cb(struct notifier_block *self,
unsigned long val, void *data)
{
struct tp_module *tp_mod = data;
+ struct tracepoint_user *tuser;
struct tracepoint *tpoint;
+
+ if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING)
+ return NOTIFY_DONE;
+
+ mutex_lock(&tracepoint_user_mutex);
+ for_each_tracepoint_user(tuser) {
+ if (val == MODULE_STATE_COMING) {
+ /* This is not a tracepoint in this module. Skip it. */
+ tpoint = find_tracepoint_in_module(tp_mod->mod, tuser->name);
+ if (!tpoint)
+ continue;
+ WARN_ON_ONCE(tracepoint_user_register_again(tuser, tpoint));
+ } else if (val == MODULE_STATE_GOING &&
+ tracepoint_user_within_module(tuser, tp_mod->mod)) {
+ /* Unregister all tracepoint_user in this module. */
+ tracepoint_user_unregister_clear(tuser);
+ }
+ }
+ mutex_unlock(&tracepoint_user_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tracepoint_module_nb = {
+ .notifier_call = __tracepoint_probe_module_cb,
+};
+
+/* module callback for tprobe events */
+static int __tprobe_event_module_cb(struct notifier_block *self,
+ unsigned long val, void *data)
+{
struct trace_fprobe *tf;
struct dyn_event *pos;
+ struct module *mod = data;
if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING)
return NOTIFY_DONE;
mutex_lock(&event_mutex);
for_each_trace_fprobe(tf, pos) {
- if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) {
- tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol);
- if (tpoint) {
- tf->tpoint = tpoint;
- tf->mod = tp_mod->mod;
- if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) &&
- trace_probe_is_enabled(&tf->tp))
- reenable_trace_fprobe(tf);
- }
- } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) {
+ /* Skip fprobe and disabled tprobe events. */
+ if (!trace_fprobe_is_tracepoint(tf) || !tf->tuser)
+ continue;
+
+		/* Before this notification, the tracepoint notifier has already run. */
+ if (val == MODULE_STATE_COMING &&
+ tracepoint_user_within_module(tf->tuser, mod)) {
+ unsigned long ip = tracepoint_user_ip(tf->tuser);
+
+ WARN_ON_ONCE(register_fprobe_ips(&tf->fp, &ip, 1));
+ } else if (val == MODULE_STATE_GOING &&
+ /*
+ * tracepoint_user_within_module() does not work here because
+			 * the tracepoint_user is already unregistered and its tpoint cleared.
+			 * Instead, check whether the fprobe is registered but the
+			 * tpoint is cleared (unregistered). Such unbalanced probes
+			 * must be adjusted anyway.
+ */
+ trace_fprobe_is_registered(tf) &&
+ !tf->tuser->tpoint) {
unregister_fprobe(&tf->fp);
- if (trace_fprobe_is_tracepoint(tf)) {
- tracepoint_probe_unregister(tf->tpoint,
- tf->tpoint->probestub, NULL);
- tf->tpoint = TRACEPOINT_STUB;
- tf->mod = NULL;
- }
}
}
mutex_unlock(&event_mutex);
@@ -1025,8 +1143,11 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self,
return NOTIFY_DONE;
}
-static struct notifier_block tracepoint_module_nb = {
- .notifier_call = __tracepoint_probe_module_cb,
+/* NOTE: this must be called after the tracepoint module notifier callback */
+static struct notifier_block tprobe_event_module_nb = {
+ .notifier_call = __tprobe_event_module_cb,
+ /* Make sure this is later than tracepoint module notifier. */
+ .priority = -10,
};
#endif /* CONFIG_MODULES */
@@ -1086,8 +1207,6 @@ static int parse_symbol_and_return(int argc, const char *argv[],
return 0;
}
-DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T))
-
static int trace_fprobe_create_internal(int argc, const char *argv[],
struct traceprobe_parse_context *ctx)
{
@@ -1116,19 +1235,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
- int i, new_argc = 0, ret = 0;
- bool is_return = false;
- char *symbol __free(kfree) = NULL;
const char *event = NULL, *group = FPROBE_EVENT_SYSTEM;
+ struct module *mod __free(module_put) = NULL;
const char **new_argv __free(kfree) = NULL;
- char buf[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
- char sbuf[KSYM_NAME_LEN];
- char abuf[MAX_BTF_ARGS_LEN];
+ char *symbol __free(kfree) = NULL;
+ char *ebuf __free(kfree) = NULL;
+ char *gbuf __free(kfree) = NULL;
+ char *sbuf __free(kfree) = NULL;
+ char *abuf __free(kfree) = NULL;
char *dbuf __free(kfree) = NULL;
+ int i, new_argc = 0, ret = 0;
bool is_tracepoint = false;
- struct module *tp_mod __free(module_put) = NULL;
- struct tracepoint *tpoint = NULL;
+ bool is_return = false;
if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
return -ECANCELED;
@@ -1156,6 +1274,9 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
trace_probe_log_set_index(0);
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ return -ENOMEM;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
@@ -1163,15 +1284,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
}
if (!event) {
+ ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!ebuf)
+ return -ENOMEM;
/* Make a new event name */
if (is_tracepoint)
- snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s",
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s%s",
isdigit(*symbol) ? "_" : "", symbol);
else
- snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol,
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s__%s", symbol,
is_return ? "exit" : "entry");
- sanitize_event_name(buf);
- event = buf;
+ sanitize_event_name(ebuf);
+ event = ebuf;
}
if (is_return)
@@ -1179,25 +1303,28 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
else
ctx->flags |= TPARG_FL_FENTRY;
+ ctx->funcname = NULL;
if (is_tracepoint) {
+ /* Get tracepoint and lock its module until the end of the registration. */
+ struct tracepoint *tpoint;
+
ctx->flags |= TPARG_FL_TPOINT;
- tpoint = find_tracepoint(symbol, &tp_mod);
+ mod = NULL;
+ tpoint = find_tracepoint(symbol, &mod);
if (tpoint) {
- ctx->funcname = kallsyms_lookup(
- (unsigned long)tpoint->probestub,
- NULL, NULL, NULL, sbuf);
- } else if (IS_ENABLED(CONFIG_MODULES)) {
- /* This *may* be loaded afterwards */
- tpoint = TRACEPOINT_STUB;
- ctx->funcname = symbol;
- } else {
- trace_probe_log_set_index(1);
- trace_probe_log_err(0, NO_TRACEPOINT);
- return -EINVAL;
+ sbuf = kmalloc(KSYM_NAME_LEN, GFP_KERNEL);
+ if (!sbuf)
+ return -ENOMEM;
+ ctx->funcname = kallsyms_lookup((unsigned long)tpoint->probestub,
+ NULL, NULL, NULL, sbuf);
}
- } else
+ }
+ if (!ctx->funcname)
ctx->funcname = symbol;
+ abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL);
+ if (!abuf)
+ return -ENOMEM;
argc -= 2; argv += 2;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
abuf, MAX_BTF_ARGS_LEN, ctx);
@@ -1218,8 +1345,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
return ret;
/* setup a probe */
- tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod,
- argc, is_return);
+ tf = alloc_trace_fprobe(group, event, symbol, argc, is_return, is_tracepoint);
if (IS_ERR(tf)) {
ret = PTR_ERR(tf);
/* This must return -ENOMEM, else there is a bug */
@@ -1251,7 +1377,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
if (ret < 0)
return ret;
- ret = register_trace_fprobe(tf);
+ ret = register_trace_fprobe_event(tf);
if (ret) {
trace_probe_log_set_index(1);
if (ret == -EILSEQ)
@@ -1271,14 +1397,17 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
static int trace_fprobe_create_cb(int argc, const char *argv[])
{
- struct traceprobe_parse_context ctx = {
- .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
- };
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
int ret;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE;
+
trace_probe_log_init("trace_fprobe", argc, argv);
- ret = trace_fprobe_create_internal(argc, argv, &ctx);
- traceprobe_finish_parse(&ctx);
+ ret = trace_fprobe_create_internal(argc, argv, ctx);
trace_probe_log_clear();
return ret;
}
@@ -1321,6 +1450,84 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
}
/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int enable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *tp;
+ struct trace_fprobe *tf;
+ bool enabled;
+ int ret = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
+ if (file) {
+ ret = trace_probe_add_file(tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+ if (!enabled) {
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ ret = __register_trace_fprobe(tf);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int disable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_fprobe *tf;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ if (file) {
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
+ goto out;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+ } else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+ if (!trace_probe_is_enabled(tp)) {
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ unregister_fprobe(&tf->fp);
+ }
+ }
+
+ out:
+ if (file)
+ /*
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
+ */
+ trace_probe_remove_file(tp, file);
+
+ return 0;
+}
+
+/*
* called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
*/
static int fprobe_register(struct trace_event_call *event,
@@ -1365,6 +1572,9 @@ static __init int init_fprobe_trace_early(void)
ret = register_tracepoint_module_notifier(&tracepoint_module_nb);
if (ret)
return ret;
+ ret = register_module_notifier(&tprobe_event_module_nb);
+ if (ret)
+ return ret;
#endif
return 0;
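
The tracepoint_user introduced above is a refcounted, name-keyed object on a global list protected by its own mutex: lookups either take a reference on an existing entry or create and register a new one, and the final put unregisters it and frees it outside the lock. A reduced sketch of that lifetime pattern, using the same guard()/scoped_guard() helpers; the names below are hypothetical:

#include <linux/cleanup.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/string.h>

struct named_user {
	struct list_head list;
	const char *name;
	unsigned int refcount;
};

static LIST_HEAD(user_list);
static DEFINE_MUTEX(user_mutex);

static struct named_user *user_find_get(const char *name)
{
	struct named_user *u;

	guard(mutex)(&user_mutex);	/* dropped automatically on return */

	list_for_each_entry(u, &user_list, list) {
		if (!strcmp(u->name, name)) {
			u->refcount++;
			return u;
		}
	}

	u = kzalloc(sizeof(*u), GFP_KERNEL);
	if (!u)
		return NULL;
	u->name = kstrdup(name, GFP_KERNEL);
	if (!u->name) {
		kfree(u);
		return NULL;
	}
	u->refcount = 1;
	list_add(&u->list, &user_list);
	return u;
}

static void user_put(struct named_user *u)
{
	scoped_guard(mutex, &user_mutex) {
		if (--u->refcount > 0)
			return;
		list_del(&u->list);
	}
	kfree(u->name);
	kfree(u);
}

Freeing outside the scoped_guard() keeps kfree() off the locked path, the same ordering tracepoint_user_put() uses above.
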
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 4e37a0f6aaa3..d17c18934445 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -209,7 +209,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
- struct trace_array_cpu *data;
unsigned int trace_ctx;
int bit;
@@ -224,9 +223,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_dec();
- data = this_cpu_ptr(tr->array_buffer.data);
- if (!atomic_read(&data->disabled))
- trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
ftrace_test_recursion_unlock(bit);
}
@@ -236,10 +233,8 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
struct trace_array *tr = op->private;
- struct trace_array_cpu *data;
unsigned int trace_ctx;
int bit;
- int cpu;
if (unlikely(!tr->function_enabled))
return;
@@ -250,10 +245,7 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx();
- cpu = smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
- if (!atomic_read(&data->disabled))
- trace_function(tr, ip, parent_ip, trace_ctx, fregs);
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
ftrace_test_recursion_unlock(bit);
}
@@ -299,7 +291,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
if (likely(disabled == 1)) {
trace_ctx = tracing_gen_ctx_flags(flags);
@@ -311,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
__trace_stack(tr, trace_ctx, skip);
}
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
local_irq_restore(flags);
}
@@ -352,7 +344,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
{
struct trace_func_repeats *last_info;
struct trace_array *tr = op->private;
- struct trace_array_cpu *data;
unsigned int trace_ctx;
int bit;
@@ -364,8 +355,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
return;
parent_ip = function_get_true_parent_ip(parent_ip, fregs);
- data = this_cpu_ptr(tr->array_buffer.data);
- if (atomic_read(&data->disabled))
+ if (!tracer_tracing_is_on(tr))
goto out;
/*
@@ -412,7 +402,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
parent_ip = function_get_true_parent_ip(parent_ip, fregs);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
if (likely(disabled == 1)) {
last_info = per_cpu_ptr(tr->last_func_repeats, cpu);
@@ -427,7 +417,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
}
out:
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
local_irq_restore(flags);
}
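
The atomic_t to local_t conversions above work because the per-CPU "disabled" counter is only ever updated from its own CPU, so the cheaper non-LOCK'd local_* operations suffice while the inc_return == 1 test still guards against nested entry. A minimal sketch of that guard, assuming the hook runs with preemption disabled as ftrace callbacks do; the names are hypothetical:

#include <linux/percpu.h>
#include <asm/local.h>

struct cpu_trace_state {
	local_t disabled;	/* touched only by its owning CPU */
};

static DEFINE_PER_CPU(struct cpu_trace_state, cpu_trace_state);

/* Assumes preemption is already disabled, as in an ftrace callback. */
static void example_hook(void)
{
	struct cpu_trace_state *s = this_cpu_ptr(&cpu_trace_state);

	/*
	 * local_inc_return() avoids the LOCK-prefixed RMW an atomic_t
	 * would need; only the first (outermost) entry does real work.
	 */
	if (local_inc_return(&s->disabled) == 1) {
		/* ... record the event ... */
	}

	local_dec(&s->disabled);
}
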
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 0c357a89c58e..66e1a527cf1a 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -202,12 +202,9 @@ static int graph_entry(struct ftrace_graph_ent *trace,
{
unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
- struct trace_array_cpu *data;
struct fgraph_times *ftimes;
unsigned int trace_ctx;
- long disabled;
int ret = 0;
- int cpu;
if (*task_var & TRACE_GRAPH_NOTRACE)
return 0;
@@ -257,21 +254,14 @@ static int graph_entry(struct ftrace_graph_ent *trace,
if (tracing_thresh)
return 1;
- preempt_disable_notrace();
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_read(&data->disabled);
- if (likely(!disabled)) {
- trace_ctx = tracing_gen_ctx();
- if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
- tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) {
- unsigned long retaddr = ftrace_graph_top_ret_addr(current);
- ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
- } else {
- ret = __graph_entry(tr, trace, trace_ctx, fregs);
- }
+ trace_ctx = tracing_gen_ctx();
+ if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
+ tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) {
+ unsigned long retaddr = ftrace_graph_top_ret_addr(current);
+ ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
+ } else {
+ ret = __graph_entry(tr, trace, trace_ctx, fregs);
}
- preempt_enable_notrace();
return ret;
}
@@ -351,13 +341,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
{
unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
- struct trace_array_cpu *data;
struct fgraph_times *ftimes;
unsigned int trace_ctx;
u64 calltime, rettime;
- long disabled;
int size;
- int cpu;
rettime = trace_clock_local();
@@ -376,15 +363,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
calltime = ftimes->calltime;
- preempt_disable_notrace();
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_read(&data->disabled);
- if (likely(!disabled)) {
- trace_ctx = tracing_gen_ctx();
- __trace_graph_return(tr, trace, trace_ctx, calltime, rettime);
- }
- preempt_enable_notrace();
+ trace_ctx = tracing_gen_ctx();
+ __trace_graph_return(tr, trace, trace_ctx, calltime, rettime);
}
static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
@@ -475,10 +455,16 @@ static int graph_trace_init(struct trace_array *tr)
return 0;
}
+static struct tracer graph_trace;
+
static int ftrace_graph_trace_args(struct trace_array *tr, int set)
{
trace_func_graph_ent_t entry;
+ /* Do nothing if the current tracer is not this tracer */
+ if (tr->current_trace != &graph_trace)
+ return 0;
+
if (set)
entry = trace_graph_entry_args;
else
@@ -527,7 +513,7 @@ static void print_graph_proc(struct trace_seq *s, pid_t pid)
{
char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
- char pid_str[11];
+ char pid_str[12];
int spaces = 0;
int len;
int i;
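
The pid_str bump above from 11 to 12 bytes follows from the worst case for a signed 32-bit pid: "-2147483648" is a sign plus ten digits, i.e. 11 characters, and the terminating NUL makes 12, so an 11-byte buffer lets snprintf() truncate (and newer compilers warn about it). A tiny sanity check of that arithmetic, assuming INT_MIN as the extreme value:

#include <linux/bug.h>		/* WARN_ON_ONCE() */
#include <linux/kernel.h>	/* snprintf() */
#include <linux/limits.h>	/* INT_MIN */

static void pid_str_demo(void)
{
	char pid_str[12];
	int len;

	/* "-2147483648" -> len == 11; fits together with the trailing NUL. */
	len = snprintf(pid_str, sizeof(pid_str), "%d", INT_MIN);
	WARN_ON_ONCE(len + 1 > sizeof(pid_str));
}
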
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 40c39e946940..5496758b6c76 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -123,12 +123,12 @@ static int func_prolog_dec(struct trace_array *tr,
return 0;
*data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&(*data)->disabled);
+ disabled = local_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
return 1;
- atomic_dec(&(*data)->disabled);
+ local_dec(&(*data)->disabled);
return 0;
}
@@ -152,7 +152,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
trace_function(tr, ip, parent_ip, trace_ctx, fregs);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
}
#endif /* CONFIG_FUNCTION_TRACER */
@@ -209,7 +209,7 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
trace_ctx = tracing_gen_ctx_flags(flags);
ret = __trace_graph_entry(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
return ret;
}
@@ -238,7 +238,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
trace_ctx = tracing_gen_ctx_flags(flags);
__trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
}
static struct fgraph_ops fgraph_ops = {
@@ -397,6 +397,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
+ long disabled;
if (!tracer_enabled || !tracing_is_enabled())
return;
@@ -408,20 +409,22 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- if (unlikely(!data) || atomic_read(&data->disabled))
+ if (unlikely(!data) || local_read(&data->disabled))
return;
- atomic_inc(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
- data->critical_sequence = max_sequence;
- data->preempt_timestamp = ftrace_now(cpu);
- data->critical_start = parent_ip ? : ip;
+ if (disabled == 1) {
+ data->critical_sequence = max_sequence;
+ data->preempt_timestamp = ftrace_now(cpu);
+ data->critical_start = parent_ip ? : ip;
- __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
+ __trace_function(tr, ip, parent_ip, tracing_gen_ctx());
- per_cpu(tracing_cpu, cpu) = 1;
+ per_cpu(tracing_cpu, cpu) = 1;
+ }
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
}
static nokprobe_inline void
@@ -431,6 +434,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ long disabled;
cpu = raw_smp_processor_id();
/* Always clear the tracing cpu on stopping the trace */
@@ -445,16 +449,19 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
data = per_cpu_ptr(tr->array_buffer.data, cpu);
if (unlikely(!data) ||
- !data->critical_start || atomic_read(&data->disabled))
+ !data->critical_start || local_read(&data->disabled))
return;
- atomic_inc(&data->disabled);
+ disabled = local_inc_return(&data->disabled);
- trace_ctx = tracing_gen_ctx();
- __trace_function(tr, ip, parent_ip, trace_ctx);
- check_critical_timing(tr, data, parent_ip ? : ip, cpu);
- data->critical_start = 0;
- atomic_dec(&data->disabled);
+ if (disabled == 1) {
+ trace_ctx = tracing_gen_ctx();
+ __trace_function(tr, ip, parent_ip, trace_ctx);
+ check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+ data->critical_start = 0;
+ }
+
+ local_dec(&data->disabled);
}
/* start and stop critical timings used to for stoppage (in idle) */
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 1e72d20b3c2f..896ff78b8349 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.array_buffer->buffer,
- cpu, GFP_ATOMIC);
- ring_buffer_read_start(iter.buffer_iter[cpu]);
+ ring_buffer_read_start(iter.array_buffer->buffer,
+ cpu, GFP_ATOMIC);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.array_buffer->buffer,
+ ring_buffer_read_start(iter.array_buffer->buffer,
cpu_file, GFP_ATOMIC);
- ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
@@ -98,7 +96,6 @@ static int kdb_ftdump(int argc, const char **argv)
long cpu_file;
int err;
int cnt;
- int cpu;
if (argc > 2)
return KDB_ARGCOUNT;
@@ -120,9 +117,7 @@ static int kdb_ftdump(int argc, const char **argv)
trace_init_global_iter(&iter);
iter.buffer_iter = buffer_iter;
- for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ tracer_tracing_disable(iter.tr);
/* A negative skip_entries means skip all but the last entries */
if (skip_entries < 0) {
@@ -135,9 +130,7 @@ static int kdb_ftdump(int argc, const char **argv)
ftrace_dump_buf(skip_entries, cpu_file);
- for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
- }
+ tracer_tracing_enable(iter.tr);
kdb_trap_printk--;
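The open-coded loops that bumped each CPU's disabled counter around the dump are replaced with the tracer_tracing_disable()/tracer_tracing_enable() helpers, which take the trace_array and handle all CPUs internally. The resulting call pattern, as used by the hunk above (setup details omitted):

    	trace_init_global_iter(&iter);
    	iter.buffer_iter = buffer_iter;

    	tracer_tracing_disable(iter.tr);	/* quiesce the buffer for dumping */
    	ftrace_dump_buf(skip_entries, cpu_file);
    	tracer_tracing_enable(iter.tr);		/* resume tracing */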
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3e5c47b6d7b2..ccae62d4fb91 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -9,19 +9,19 @@
#include <linux/bpf-cgroup.h>
#include <linux/cleanup.h>
-#include <linux/security.h>
+#include <linux/error-injection.h>
#include <linux/module.h>
-#include <linux/uaccess.h>
#include <linux/rculist.h>
-#include <linux/error-injection.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */
#include "trace_dynevent.h"
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
-#include "trace_probe_tmpl.h"
#include "trace_probe_kernel.h"
+#include "trace_probe_tmpl.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
@@ -861,20 +861,20 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
+ const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
+ const char **new_argv __free(kfree) = NULL;
int i, len, new_argc = 0, ret = 0;
- bool is_return = false;
char *symbol __free(kfree) = NULL;
- char *tmp = NULL;
- const char **new_argv __free(kfree) = NULL;
- const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
+ char *ebuf __free(kfree) = NULL;
+ char *gbuf __free(kfree) = NULL;
+ char *abuf __free(kfree) = NULL;
+ char *dbuf __free(kfree) = NULL;
enum probe_print_type ptype;
+ bool is_return = false;
int maxactive = 0;
- long offset = 0;
void *addr = NULL;
- char buf[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
- char abuf[MAX_BTF_ARGS_LEN];
- char *dbuf __free(kfree) = NULL;
+ char *tmp = NULL;
+ long offset = 0;
switch (argv[0][0]) {
case 'r':
@@ -893,6 +893,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
event++;
if (isdigit(argv[0][1])) {
+ char *buf __free(kfree) = NULL;
+
if (!is_return) {
trace_probe_log_err(1, BAD_MAXACT_TYPE);
return -EINVAL;
@@ -905,7 +907,7 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
trace_probe_log_err(1, BAD_MAXACT);
return -EINVAL;
}
- memcpy(buf, &argv[0][1], len);
+ buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL);
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {
@@ -973,6 +975,9 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
trace_probe_log_set_index(0);
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ return -ENOMEM;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
@@ -981,16 +986,22 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
if (!event) {
/* Make a new event name */
+ ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!ebuf)
+ return -ENOMEM;
if (symbol)
- snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
is_return ? 'r' : 'p', symbol, offset);
else
- snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_0x%p",
is_return ? 'r' : 'p', addr);
- sanitize_event_name(buf);
- event = buf;
+ sanitize_event_name(ebuf);
+ event = ebuf;
}
+ abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL);
+ if (!abuf)
+ return -ENOMEM;
argc -= 2; argv += 2;
ctx->funcname = symbol;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
@@ -1065,14 +1076,18 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
static int trace_kprobe_create_cb(int argc, const char *argv[])
{
- struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
int ret;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->flags = TPARG_FL_KERNEL;
+
trace_probe_log_init("trace_kprobe", argc, argv);
- ret = trace_kprobe_create_internal(argc, argv, &ctx);
+ ret = trace_kprobe_create_internal(argc, argv, ctx);
- traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
return ret;
}
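The fixed-size stack buffers (ebuf, gbuf, abuf) and the on-stack parse context move to the heap and are released automatically via the scope-based cleanup helpers from <linux/cleanup.h>. A minimal, self-contained sketch of the __free(kfree) idiom (parse_something is an illustrative function and the 64-byte length stands in for MAX_EVENT_NAME_LEN):

    #include <linux/cleanup.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static int parse_something(const char *src)
    {
    	char *buf __free(kfree) = kmalloc(64, GFP_KERNEL);

    	if (!buf)
    		return -ENOMEM;

    	strscpy(buf, src, 64);
    	/* ... use buf; kfree(buf) runs automatically on every return path ... */
    	return 0;
    }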
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index ba5858866b2f..c706544be60c 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -291,7 +291,6 @@ __init static int init_mmio_trace(void)
device_initcall(init_mmio_trace);
static void __trace_mmiotrace_rw(struct trace_array *tr,
- struct trace_array_cpu *data,
struct mmiotrace_rw *rw)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -315,12 +314,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
void mmio_trace_rw(struct mmiotrace_rw *rw)
{
struct trace_array *tr = mmio_trace_array;
- struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id());
- __trace_mmiotrace_rw(tr, data, rw);
+ __trace_mmiotrace_rw(tr, rw);
}
static void __trace_mmiotrace_map(struct trace_array *tr,
- struct trace_array_cpu *data,
struct mmiotrace_map *map)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -344,12 +341,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
void mmio_trace_mapping(struct mmiotrace_map *map)
{
struct trace_array *tr = mmio_trace_array;
- struct trace_array_cpu *data;
-
- preempt_disable();
- data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id());
- __trace_mmiotrace_map(tr, data, map);
- preempt_enable();
+ __trace_mmiotrace_map(tr, map);
}
int mmio_trace_printk(const char *fmt, va_list args)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index e732c9e37e14..fd259da0aa64 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -637,8 +637,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u
entry = ring_buffer_event_data(event);
- memcpy(&entry->caller, fstack->calls, size);
entry->size = fstack->nr_entries;
+ memcpy(&entry->caller, fstack->calls, size);
trace_buffer_unlock_commit_nostack(buffer, event);
}
@@ -2302,7 +2302,7 @@ osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
* osnoise_cpus_write - Write function for "cpus" entry
* @filp: The active open file structure
* @ubuf: The user buffer that contains the value to write
- * @cnt: The maximum number of bytes to write to "file"
+ * @count: The maximum number of bytes to write to "file"
* @ppos: The current position in @file
*
* This function provides a write implementation for the "cpus"
@@ -2320,10 +2320,11 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
{
cpumask_var_t osnoise_cpumask_new;
int running, err;
- char buf[256];
+ char *buf __free(kfree) = NULL;
- if (count >= 256)
- return -EINVAL;
+ buf = kmalloc(count, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
if (copy_from_user(buf, ubuf, count))
return -EFAULT;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b9ab06c99543..0b3db02030a7 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -938,6 +938,9 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
struct list_head *head)
{
struct ftrace_event_field *field;
+ struct trace_array *tr = iter->tr;
+ unsigned long long laddr;
+ unsigned long addr;
int offset;
int len;
int ret;
@@ -974,8 +977,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
case FILTER_PTR_STRING:
if (!iter->fmt_size)
trace_iter_expand_format(iter);
- pos = *(void **)pos;
- ret = strncpy_from_kernel_nofault(iter->fmt, pos,
+ addr = trace_adjust_address(tr, *(unsigned long *)pos);
+ ret = strncpy_from_kernel_nofault(iter->fmt, (void *)addr,
iter->fmt_size);
if (ret < 0)
trace_seq_printf(&iter->seq, "(0x%px)", pos);
@@ -984,8 +987,8 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
pos, iter->fmt);
break;
case FILTER_TRACE_FN:
- pos = *(void **)pos;
- trace_seq_printf(&iter->seq, "%pS", pos);
+ addr = trace_adjust_address(tr, *(unsigned long *)pos);
+ trace_seq_printf(&iter->seq, "%pS", (void *)addr);
break;
case FILTER_CPU:
case FILTER_OTHER:
@@ -1015,14 +1018,36 @@ static void print_fields(struct trace_iterator *iter, struct trace_event_call *c
break;
}
- trace_seq_printf(&iter->seq, "0x%x (%d)",
- *(unsigned int *)pos,
- *(unsigned int *)pos);
+ addr = *(unsigned int *)pos;
+
+ /* Some fields reference offset from _stext. */
+ if (!strcmp(field->name, "caller_offs") ||
+ !strcmp(field->name, "parent_offs")) {
+ unsigned long ip;
+
+ ip = addr + (unsigned long)_stext;
+ ip = trace_adjust_address(tr, ip);
+ trace_seq_printf(&iter->seq, "%pS ", (void *)ip);
+ }
+
+ if (sizeof(long) == 4) {
+ addr = trace_adjust_address(tr, addr);
+ trace_seq_printf(&iter->seq, "%pS (%d)",
+ (void *)addr, (int)addr);
+ } else {
+ trace_seq_printf(&iter->seq, "0x%x (%d)",
+ (unsigned int)addr, (int)addr);
+ }
break;
case 8:
- trace_seq_printf(&iter->seq, "0x%llx (%lld)",
- *(unsigned long long *)pos,
- *(unsigned long long *)pos);
+ laddr = *(unsigned long long *)pos;
+ if (sizeof(long) == 8) {
+ laddr = trace_adjust_address(tr, (unsigned long)laddr);
+ trace_seq_printf(&iter->seq, "%pS (%lld)",
+ (void *)(long)laddr, laddr);
+ } else {
+ trace_seq_printf(&iter->seq, "0x%llx (%lld)", laddr, laddr);
+ }
break;
default:
trace_seq_puts(&iter->seq, "<INVALID-SIZE>");
@@ -1086,11 +1111,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
}
static void print_fn_trace(struct trace_seq *s, unsigned long ip,
- unsigned long parent_ip, long delta,
- unsigned long *args, int flags)
+ unsigned long parent_ip, unsigned long *args,
+ struct trace_array *tr, int flags)
{
- ip += delta;
- parent_ip += delta;
+ ip = trace_adjust_address(tr, ip);
+ parent_ip = trace_adjust_address(tr, parent_ip);
seq_print_ip_sym(s, ip, flags);
if (args)
@@ -1119,8 +1144,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
else
args = NULL;
- print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta,
- args, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, args, iter->tr, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -1706,7 +1730,7 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- ip = field->ip + iter->tr->text_delta;
+ ip = trace_adjust_address(iter->tr, field->ip);
seq_print_ip_sym(s, ip, flags);
trace_seq_printf(s, ": %s", field->buf);
@@ -1792,7 +1816,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, NULL, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, NULL, iter->tr, flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
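Instead of adding a single text_delta, the output code now funnels every recorded address through trace_adjust_address(), which presumably also accounts for per-module offsets so that pointers recorded by a previous boot (for example into a persistent ring buffer) still resolve to symbols. The call pattern used throughout the hunks above:

    	unsigned long addr;

    	/* Map the recorded address into this boot's address space. */
    	addr = trace_adjust_address(iter->tr, field->ip);
    	trace_seq_printf(s, "%pS", (void *)addr);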
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 424751cdf31f..5cbdc423afeb 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -13,8 +13,8 @@
#include <linux/bpf.h>
#include <linux/fs.h>
-#include "trace_btf.h"
+#include "trace_btf.h"
#include "trace_probe.h"
#undef C
@@ -247,7 +247,25 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)
return 0;
}
-/* @buf must has MAX_EVENT_NAME_LEN size */
+/**
+ * traceprobe_parse_event_name() - Parse a string into group and event names
+ * @pevent: A pointer to the string to be parsed.
+ * @pgroup: A pointer to the group name.
+ * @buf: A buffer to store the parsed group name.
+ * @offset: The offset of the string in the original user command, for logging.
+ *
+ * This parses a string with the format `[GROUP/][EVENT]` or `[GROUP.][EVENT]`
+ * (either GROUP or EVENT or both must be specified).
+ * Since the parsed group name is stored in @buf, the caller must ensure @buf
+ * is at least MAX_EVENT_NAME_LEN bytes.
+ *
+ * Return: 0 on success, or -EINVAL on failure.
+ *
+ * On success, *@pevent is updated to point to the event name part of the
+ * original string, or NULL if there is no event name.
+ * Also, *@pgroup is updated to point to the parsed group, which is stored
+ * in @buf, or NULL if there is no group name.
+ */
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
char *buf, int offset)
{
@@ -657,7 +675,7 @@ static int parse_btf_arg(char *varname,
ret = query_btf_context(ctx);
if (ret < 0 || ctx->nr_params == 0) {
trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
- return PTR_ERR(params);
+ return -ENOENT;
}
}
params = ctx->params;
@@ -779,6 +797,36 @@ static int check_prepare_btf_string_fetch(char *typename,
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+static void store_entry_arg_at(struct fetch_insn *code, int argnum, int offset)
+{
+ code[0].op = FETCH_OP_ARG;
+ code[0].param = argnum;
+ code[1].op = FETCH_OP_ST_EDATA;
+ code[1].offset = offset;
+}
+
+static int get_entry_arg_max_offset(struct probe_entry_arg *earg)
+{
+ int i, max_offset = 0;
+
+ /*
+ * earg->code[] array has an operation sequence which is run in
+ * the entry handler.
+ * The sequence is stopped by FETCH_OP_END and each piece of data is stored
+ * in the entry data buffer by FETCH_OP_ST_EDATA. FETCH_OP_ST_EDATA stores
+ * the data at the data buffer plus its offset, and all data are of
+ * "unsigned long" size. The offset must be increased each time data is
+ * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the
+ * code array.
+ */
+ for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i++) {
+ if (earg->code[i].op == FETCH_OP_ST_EDATA)
+ if (earg->code[i].offset > max_offset)
+ max_offset = earg->code[i].offset;
+ }
+ return max_offset;
+}
+
/*
* Add the entry code to store the 'argnum'th parameter and return the offset
* in the entry data buffer where the data will be stored.
@@ -786,8 +834,7 @@ static int check_prepare_btf_string_fetch(char *typename,
static int __store_entry_arg(struct trace_probe *tp, int argnum)
{
struct probe_entry_arg *earg = tp->entry_arg;
- bool match = false;
- int i, offset;
+ int i, offset, last_offset = 0;
if (!earg) {
earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL);
@@ -804,78 +851,59 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum)
for (i = 0; i < earg->size; i++)
earg->code[i].op = FETCH_OP_END;
tp->entry_arg = earg;
+ store_entry_arg_at(earg->code, argnum, 0);
+ return 0;
}
/*
- * The entry code array is repeating the pair of
- * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)]
- * and the rest of entries are filled with [FETCH_OP_END].
+ * NOTE: if anyone changes the following rule, please rewrite this.
+ * The entry code array is filled with pairs of
*
- * To reduce the redundant function parameter fetching, we scan the entry
- * code array to find the FETCH_OP_ARG which already fetches the 'argnum'
- * parameter. If it doesn't match, update 'offset' to find the last
- * offset.
- * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we
- * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and
- * return data offset so that caller can find the data offset in the entry
- * data buffer.
+ * [FETCH_OP_ARG(argnum)]
+ * [FETCH_OP_ST_EDATA(offset of entry data buffer)]
+ *
+ * and the rest of entries are filled with [FETCH_OP_END].
+ * The offset should be incremented, thus the last pair should
+ * have the largest offset.
*/
- offset = 0;
- for (i = 0; i < earg->size - 1; i++) {
- switch (earg->code[i].op) {
- case FETCH_OP_END:
- earg->code[i].op = FETCH_OP_ARG;
- earg->code[i].param = argnum;
- earg->code[i + 1].op = FETCH_OP_ST_EDATA;
- earg->code[i + 1].offset = offset;
- return offset;
- case FETCH_OP_ARG:
- match = (earg->code[i].param == argnum);
- break;
- case FETCH_OP_ST_EDATA:
- offset = earg->code[i].offset;
- if (match)
- return offset;
- offset += sizeof(unsigned long);
- break;
- default:
- break;
- }
+
+ /* Search the offset for the specified argnum. */
+ for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i += 2) {
+ if (WARN_ON_ONCE(earg->code[i].op != FETCH_OP_ARG))
+ return -EINVAL;
+
+ if (earg->code[i].param != argnum)
+ continue;
+
+ if (WARN_ON_ONCE(earg->code[i + 1].op != FETCH_OP_ST_EDATA))
+ return -EINVAL;
+
+ return earg->code[i + 1].offset;
+ }
+ /* Not found, append new entry if possible. */
+ if (i >= earg->size - 1)
+ return -ENOSPC;
+
+ /* The last entry must have the largest offset. */
+ if (i != 0) {
+ if (WARN_ON_ONCE(earg->code[i - 1].op != FETCH_OP_ST_EDATA))
+ return -EINVAL;
+ last_offset = earg->code[i - 1].offset;
}
- return -ENOSPC;
+
+ offset = last_offset + sizeof(unsigned long);
+ store_entry_arg_at(&earg->code[i], argnum, offset);
+ return offset;
}
int traceprobe_get_entry_data_size(struct trace_probe *tp)
{
struct probe_entry_arg *earg = tp->entry_arg;
- int i, size = 0;
if (!earg)
return 0;
- /*
- * earg->code[] array has an operation sequence which is run in
- * the entry handler.
- * The sequence stopped by FETCH_OP_END and each data stored in
- * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA
- * stores the data at the data buffer + its offset, and all data are
- * "unsigned long" size. The offset must be increased when a data is
- * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the
- * code array.
- */
- for (i = 0; i < earg->size; i++) {
- switch (earg->code[i].op) {
- case FETCH_OP_END:
- goto out;
- case FETCH_OP_ST_EDATA:
- size = earg->code[i].offset + sizeof(unsigned long);
- break;
- default:
- break;
- }
- }
-out:
- return size;
+ return get_entry_arg_max_offset(earg) + sizeof(unsigned long);
}
void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs)
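After the rewrite, __store_entry_arg() and get_entry_arg_max_offset() depend on the entry code array being strictly [FETCH_OP_ARG][FETCH_OP_ST_EDATA] pairs terminated by FETCH_OP_END, with offsets increasing. A simplified, self-contained sketch of that scan (the enum and struct are reduced stand-ins, not the kernel's fetch_insn types):

    /* Illustrative only: minimal stand-ins for the probe fetch instructions. */
    enum op { OP_END, OP_ARG, OP_ST_EDATA };
    struct insn { enum op op; int param; int offset; };

    /* Return the stored offset for argnum, or -1 if it is not recorded. */
    static int find_entry_arg_offset(const struct insn *code, int size, int argnum)
    {
    	int i;

    	for (i = 0; i + 1 < size && code[i].op != OP_END; i += 2) {
    		if (code[i].op != OP_ARG || code[i + 1].op != OP_ST_EDATA)
    			return -1;	/* layout rule violated */
    		if (code[i].param == argnum)
    			return code[i + 1].offset;
    	}
    	return -1;
    }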
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 854e5668f5ee..842383fbc03b 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -10,20 +10,22 @@
* Author: Srikar Dronamraju
*/
+#include <linux/bitops.h>
+#include <linux/btf.h>
+#include <linux/cleanup.h>
+#include <linux/kprobes.h>
+#include <linux/limits.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/tracefs.h>
-#include <linux/types.h>
#include <linux/string.h>
-#include <linux/ptrace.h>
-#include <linux/perf_event.h>
-#include <linux/kprobes.h>
#include <linux/stringify.h>
-#include <linux/limits.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
#include <linux/uaccess.h>
-#include <linux/bitops.h>
-#include <linux/btf.h>
+
#include <asm/bitsperlong.h>
#include "trace.h"
@@ -438,6 +440,14 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
* this MUST be called for clean up the context and return a resource.
*/
void traceprobe_finish_parse(struct traceprobe_parse_context *ctx);
+static inline void traceprobe_free_parse_ctx(struct traceprobe_parse_context *ctx)
+{
+ traceprobe_finish_parse(ctx);
+ kfree(ctx);
+}
+
+DEFINE_FREE(traceprobe_parse_context, struct traceprobe_parse_context *,
+ if (_T) traceprobe_free_parse_ctx(_T))
extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
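The new DEFINE_FREE() class lets callers mark a traceprobe_parse_context pointer with __free(traceprobe_parse_context), so traceprobe_finish_parse() and kfree() run automatically when the pointer leaves scope. The caller-side pattern, mirroring trace_kprobe_create_cb() earlier in this series (error handling trimmed):

    	struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;

    	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
    	if (!ctx)
    		return -ENOMEM;
    	ctx->flags = TPARG_FL_KERNEL;
    	/* ... parse; traceprobe_free_parse_ctx(ctx) runs on scope exit ... */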
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index a0db3404f7f7..bf1cb80742ae 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -83,14 +83,14 @@ func_prolog_preempt_disable(struct trace_array *tr,
goto out_enable;
*data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&(*data)->disabled);
+ disabled = local_inc_return(&(*data)->disabled);
if (unlikely(disabled != 1))
goto out;
return 1;
out:
- atomic_dec(&(*data)->disabled);
+ local_dec(&(*data)->disabled);
out_enable:
preempt_enable_notrace();
@@ -144,7 +144,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
*calltime = trace_clock_local();
ret = __trace_graph_entry(tr, trace, trace_ctx);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
preempt_enable_notrace();
return ret;
@@ -173,7 +173,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace,
return;
__trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
preempt_enable_notrace();
return;
@@ -243,7 +243,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
trace_function(tr, ip, parent_ip, trace_ctx, fregs);
local_irq_restore(flags);
- atomic_dec(&data->disabled);
+ local_dec(&data->disabled);
preempt_enable_notrace();
}
@@ -471,7 +471,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (likely(disabled != 1))
goto out;
@@ -508,7 +508,7 @@ out_unlock:
arch_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
out:
- atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
}
static void __wakeup_reset(struct trace_array *tr)
@@ -563,7 +563,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
(!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio)))
return;
- disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ disabled = local_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -610,7 +610,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
out_locked:
arch_spin_unlock(&wakeup_lock);
out:
- atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
+ local_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled);
}
static void start_wakeup_tracer(struct trace_array *tr)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 14c6f272c4d8..0aa2514a6593 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -32,7 +32,7 @@ static arch_spinlock_t stack_trace_max_lock =
DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);
-int stack_tracer_enabled;
+static int stack_tracer_enabled;
static void print_max_stack(void)
{
@@ -542,7 +542,7 @@ static __init int enable_stacktrace(char *str)
int len;
if ((len = str_has_prefix(str, "_filter=")))
- strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
+ strscpy(stack_trace_filter_buf, str + len);
stack_tracer_enabled = 1;
return 1;
@@ -578,3 +578,23 @@ static __init int stack_trace_init(void)
}
device_initcall(stack_trace_init);
+
+
+static const struct ctl_table trace_stack_sysctl_table[] = {
+ {
+ .procname = "stack_tracer_enabled",
+ .data = &stack_tracer_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = stack_trace_sysctl,
+ },
+};
+
+static int __init init_trace_stack_sysctls(void)
+{
+ register_sysctl_init("kernel", trace_stack_sysctl_table);
+ return 0;
+}
+subsys_initcall(init_trace_stack_sysctls);
+
+
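stack_tracer_enabled becomes static and its sysctl is now registered from trace_stack.c itself via register_sysctl_init() rather than from the central kernel/sysctl.c table. The general shape of that pattern, with illustrative names and proc_dointvec standing in for a real handler:

    #include <linux/sysctl.h>

    static int my_knob;

    static const struct ctl_table my_sysctl_table[] = {
    	{
    		.procname	= "my_knob",
    		.data		= &my_knob,
    		.maxlen		= sizeof(int),
    		.mode		= 0644,
    		.proc_handler	= proc_dointvec,
    	},
    };

    static int __init my_sysctls_init(void)
    {
    	register_sysctl_init("kernel", my_sysctl_table);
    	return 0;
    }
    subsys_initcall(my_sysctls_init);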
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 35cf76c75dd7..8b0bcc0d8f41 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -8,17 +8,19 @@
#define pr_fmt(fmt) "trace_uprobe: " fmt
#include <linux/bpf-cgroup.h>
-#include <linux/security.h>
+#include <linux/cleanup.h>
#include <linux/ctype.h>
+#include <linux/filter.h>
#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/uprobes.h>
#include <linux/namei.h>
-#include <linux/string.h>
-#include <linux/rculist.h>
-#include <linux/filter.h>
#include <linux/percpu.h>
+#include <linux/rculist.h>
+#include <linux/security.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/uprobes.h>
+#include "trace.h"
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
@@ -537,15 +539,15 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
*/
static int __trace_uprobe_create(int argc, const char **argv)
{
- struct trace_uprobe *tu;
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
char *arg, *filename, *rctr, *rctr_end, *tmp;
- char buf[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
- enum probe_print_type ptype;
- struct path path;
unsigned long offset, ref_ctr_offset;
+ char *gbuf __free(kfree) = NULL;
+ char *buf __free(kfree) = NULL;
+ enum probe_print_type ptype;
+ struct trace_uprobe *tu;
bool is_return = false;
+ struct path path;
int i, ret;
ref_ctr_offset = 0;
@@ -653,6 +655,10 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* setup a probe */
trace_probe_log_set_index(0);
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ goto fail_mem;
+
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
@@ -664,15 +670,16 @@ static int __trace_uprobe_create(int argc, const char **argv)
char *ptr;
tail = kstrdup(kbasename(filename), GFP_KERNEL);
- if (!tail) {
- ret = -ENOMEM;
- goto fail_address_parse;
- }
+ if (!tail)
+ goto fail_mem;
ptr = strpbrk(tail, ".-_");
if (ptr)
*ptr = '\0';
+ buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!buf)
+ goto fail_mem;
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
event = buf;
kfree(tail);
@@ -695,13 +702,16 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* parse arguments */
for (i = 0; i < argc; i++) {
- struct traceprobe_parse_context ctx = {
- .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
- };
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context)
+ = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER;
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx);
- traceprobe_finish_parse(&ctx);
+ ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx);
if (ret)
goto error;
}
@@ -721,6 +731,9 @@ out:
trace_probe_log_clear();
return ret;
+fail_mem:
+ ret = -ENOMEM;
+
fail_address_parse:
trace_probe_log_clear();
path_put(&path);
@@ -1489,7 +1502,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
: BPF_FD_TYPE_UPROBE;
*filename = tu->filename;
*probe_offset = tu->offset;
- *probe_addr = 0;
+ *probe_addr = tu->ref_ctr_offset;
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
deleted file mode 100644
index 8303f4c7ca71..000000000000
--- a/kernel/usermode_driver.c
+++ /dev/null
@@ -1,191 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * umd - User mode driver support
- */
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
-#include <linux/mount.h>
-#include <linux/fs_struct.h>
-#include <linux/task_work.h>
-#include <linux/usermode_driver.h>
-
-static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
-{
- struct file_system_type *type;
- struct vfsmount *mnt;
- struct file *file;
- ssize_t written;
- loff_t pos = 0;
-
- type = get_fs_type("tmpfs");
- if (!type)
- return ERR_PTR(-ENODEV);
-
- mnt = kern_mount(type);
- put_filesystem(type);
- if (IS_ERR(mnt))
- return mnt;
-
- file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700);
- if (IS_ERR(file)) {
- kern_unmount(mnt);
- return ERR_CAST(file);
- }
-
- written = kernel_write(file, data, len, &pos);
- if (written != len) {
- int err = written;
- if (err >= 0)
- err = -ENOMEM;
- filp_close(file, NULL);
- kern_unmount(mnt);
- return ERR_PTR(err);
- }
-
- fput(file);
-
- /* Flush delayed fput so exec can open the file read-only */
- flush_delayed_fput();
- task_work_run();
- return mnt;
-}
-
-/**
- * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
- * @info: information about usermode driver
- * @data: a blob of bytes that can be executed as a file
- * @len: The lentgh of the blob
- *
- */
-int umd_load_blob(struct umd_info *info, const void *data, size_t len)
-{
- struct vfsmount *mnt;
-
- if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
- return -EBUSY;
-
- mnt = blob_to_mnt(data, len, info->driver_name);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
-
- info->wd.mnt = mnt;
- info->wd.dentry = mnt->mnt_root;
- return 0;
-}
-EXPORT_SYMBOL_GPL(umd_load_blob);
-
-/**
- * umd_unload_blob - Disassociate @info from a previously loaded blob
- * @info: information about usermode driver
- *
- */
-int umd_unload_blob(struct umd_info *info)
-{
- if (WARN_ON_ONCE(!info->wd.mnt ||
- !info->wd.dentry ||
- info->wd.mnt->mnt_root != info->wd.dentry))
- return -EINVAL;
-
- kern_unmount(info->wd.mnt);
- info->wd.mnt = NULL;
- info->wd.dentry = NULL;
- return 0;
-}
-EXPORT_SYMBOL_GPL(umd_unload_blob);
-
-static int umd_setup(struct subprocess_info *info, struct cred *new)
-{
- struct umd_info *umd_info = info->data;
- struct file *from_umh[2];
- struct file *to_umh[2];
- int err;
-
- /* create pipe to send data to umh */
- err = create_pipe_files(to_umh, 0);
- if (err)
- return err;
- err = replace_fd(0, to_umh[0], 0);
- fput(to_umh[0]);
- if (err < 0) {
- fput(to_umh[1]);
- return err;
- }
-
- /* create pipe to receive data from umh */
- err = create_pipe_files(from_umh, 0);
- if (err) {
- fput(to_umh[1]);
- replace_fd(0, NULL, 0);
- return err;
- }
- err = replace_fd(1, from_umh[1], 0);
- fput(from_umh[1]);
- if (err < 0) {
- fput(to_umh[1]);
- replace_fd(0, NULL, 0);
- fput(from_umh[0]);
- return err;
- }
-
- set_fs_pwd(current->fs, &umd_info->wd);
- umd_info->pipe_to_umh = to_umh[1];
- umd_info->pipe_from_umh = from_umh[0];
- umd_info->tgid = get_pid(task_tgid(current));
- return 0;
-}
-
-static void umd_cleanup(struct subprocess_info *info)
-{
- struct umd_info *umd_info = info->data;
-
- /* cleanup if umh_setup() was successful but exec failed */
- if (info->retval)
- umd_cleanup_helper(umd_info);
-}
-
-/**
- * umd_cleanup_helper - release the resources which were allocated in umd_setup
- * @info: information about usermode driver
- */
-void umd_cleanup_helper(struct umd_info *info)
-{
- fput(info->pipe_to_umh);
- fput(info->pipe_from_umh);
- put_pid(info->tgid);
- info->tgid = NULL;
-}
-EXPORT_SYMBOL_GPL(umd_cleanup_helper);
-
-/**
- * fork_usermode_driver - fork a usermode driver
- * @info: information about usermode driver (shouldn't be NULL)
- *
- * Returns either negative error or zero which indicates success in
- * executing a usermode driver. In such case 'struct umd_info *info'
- * is populated with two pipes and a tgid of the process. The caller is
- * responsible for health check of the user process, killing it via
- * tgid, and closing the pipes when user process is no longer needed.
- */
-int fork_usermode_driver(struct umd_info *info)
-{
- struct subprocess_info *sub_info;
- const char *argv[] = { info->driver_name, NULL };
- int err;
-
- if (WARN_ON_ONCE(info->tgid))
- return -EBUSY;
-
- err = -ENOMEM;
- sub_info = call_usermodehelper_setup(info->driver_name,
- (char **)argv, NULL, GFP_KERNEL,
- umd_setup, umd_cleanup, info);
- if (!sub_info)
- goto out;
-
- err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-out:
- return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_driver);
-
-
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index 1fec61603ef3..e066d31d08f8 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -210,6 +210,10 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE);
#define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
+#ifdef CONFIG_UNACCEPTED_MEMORY
+#define PAGE_UNACCEPTED_MAPCOUNT_VALUE (PGTY_unaccepted << 24)
+ VMCOREINFO_NUMBER(PAGE_UNACCEPTED_MAPCOUNT_VALUE);
+#endif
#ifdef CONFIG_KALLSYMS
VMCOREINFO_SYMBOL(kallsyms_names);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9fa2af9dbf2c..80b56c002c7f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled = 1;
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_thresh_next;
static int __read_mostly watchdog_hardlockup_available;
struct cpumask watchdog_cpumask __read_mostly;
@@ -63,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
*/
unsigned int __read_mostly hardlockup_panic =
IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
+
+#ifdef CONFIG_SYSFS
+
+static unsigned int hardlockup_count;
+
+static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%u\n", hardlockup_count);
+}
+
+static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count);
+
+static __init int kernel_hardlockup_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL);
+ return 0;
+}
+
+late_initcall(kernel_hardlockup_sysfs_init);
+
+#endif // CONFIG_SYSFS
+
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -169,6 +193,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
unsigned int this_cpu = smp_processor_id();
unsigned long flags;
+#ifdef CONFIG_SYSFS
+ ++hardlockup_count;
+#endif
+
/* Only print hardlockups once. */
if (per_cpu(watchdog_hardlockup_warned, cpu))
return;
@@ -311,6 +339,28 @@ unsigned int __read_mostly softlockup_panic =
static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
+#ifdef CONFIG_SYSFS
+
+static unsigned int softlockup_count;
+
+static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%u\n", softlockup_count);
+}
+
+static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count);
+
+static __init int kernel_softlockup_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL);
+ return 0;
+}
+
+late_initcall(kernel_softlockup_sysfs_init);
+
+#endif // CONFIG_SYSFS
+
/* Timestamp taken after the last successful reschedule. */
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
/* Timestamp of the last softlockup report. */
@@ -742,6 +792,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
touch_ts = __this_cpu_read(watchdog_touch_ts);
duration = is_softlockup(touch_ts, period_ts, now);
if (unlikely(duration)) {
+#ifdef CONFIG_SYSFS
+ ++softlockup_count;
+#endif
+
/*
* Prevent multiple soft-lockup reports if one cpu is already
* engaged in dumping all cpu back traces.
@@ -870,12 +924,20 @@ int lockup_detector_offline_cpu(unsigned int cpu)
return 0;
}
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
softlockup_stop_all();
+ /*
+ * To prevent watchdog_timer_fn from using the old interval and
+ * the new watchdog_thresh at the same time, which could lead to
+ * false softlockup reports, it is necessary to update the
+ * watchdog_thresh after the softlockup is completed.
+ */
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
@@ -888,7 +950,7 @@ static void __lockup_detector_reconfigure(void)
void lockup_detector_reconfigure(void)
{
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
mutex_unlock(&watchdog_mutex);
}
@@ -908,27 +970,29 @@ static __init void lockup_detector_setup(void)
return;
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
lockup_detector_update_enable();
watchdog_hardlockup_start();
cpus_read_unlock();
}
void lockup_detector_reconfigure(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
static inline void lockup_detector_setup(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
@@ -946,11 +1010,11 @@ void lockup_detector_soft_poweroff(void)
#ifdef CONFIG_SYSCTL
/* Propagate any changes to the watchdog infrastructure */
-static void proc_watchdog_update(void)
+static void proc_watchdog_update(bool thresh_changed)
{
/* Remove impossible cpus to keep sysctl output clean. */
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(thresh_changed);
}
/*
@@ -984,7 +1048,7 @@ static int proc_watchdog_common(int which, const struct ctl_table *table, int wr
} else {
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!err && old != READ_ONCE(*param))
- proc_watchdog_update();
+ proc_watchdog_update(false);
}
mutex_unlock(&watchdog_mutex);
return err;
@@ -1035,11 +1099,13 @@ static int proc_watchdog_thresh(const struct ctl_table *table, int write,
mutex_lock(&watchdog_mutex);
- old = READ_ONCE(watchdog_thresh);
+ watchdog_thresh_next = READ_ONCE(watchdog_thresh);
+
+ old = watchdog_thresh_next;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!err && write && old != READ_ONCE(watchdog_thresh))
- proc_watchdog_update();
+ if (!err && write && old != READ_ONCE(watchdog_thresh_next))
+ proc_watchdog_update(true);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1060,7 +1126,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write,
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write)
- proc_watchdog_update();
+ proc_watchdog_update(false);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1080,7 +1146,7 @@ static const struct ctl_table watchdog_sysctls[] = {
},
{
.procname = "watchdog_thresh",
- .data = &watchdog_thresh,
+ .data = &watchdog_thresh_next,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog_thresh,
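Writes to the watchdog_thresh sysctl now land in watchdog_thresh_next; watchdog_thresh itself is only updated inside __lockup_detector_reconfigure() after the softlockup detector has been stopped, so watchdog_timer_fn() never sees a new threshold paired with an old sample period. The staged update, simplified from the hunks above:

    	/* inside __lockup_detector_reconfigure(), under watchdog_mutex */
    	softlockup_stop_all();
    	if (thresh_changed)
    		watchdog_thresh = READ_ONCE(watchdog_thresh_next);
    	set_sample_period();
    	/* ... re-enable and restart the per-CPU watchdogs ... */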
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 75af12ff774e..9c58f5b4381d 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -187,6 +187,28 @@ void watchdog_hardlockup_disable(unsigned int cpu)
}
/**
+ * hardlockup_detector_perf_adjust_period - Adjust the event period due
+ * to current cpu frequency change
+ * @period: The target period to be set
+ */
+void hardlockup_detector_perf_adjust_period(u64 period)
+{
+ struct perf_event *event = this_cpu_read(watchdog_ev);
+
+ if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED))
+ return;
+
+ if (!event)
+ return;
+
+ if (event->attr.sample_period == period)
+ return;
+
+ if (perf_event_period(event, period))
+ pr_err("failed to change period to %llu\n", period);
+}
+
+/**
* hardlockup_detector_perf_stop - Globally stop watchdog events
*
* Special interface for x86 to handle the perf HT bug.
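hardlockup_detector_perf_adjust_period() gives architecture code a way to resize the sample period of the per-CPU hardlockup perf event when the CPU clock changes, since the period is counted in CPU cycles. A hypothetical caller sketch; new_freq_hz and the place it is called from are assumptions, only the helper itself comes from this patch:

    	/* e.g. after learning about a frequency change on this CPU */
    	u64 period = (u64)new_freq_hz * watchdog_thresh;

    	hardlockup_detector_perf_adjust_period(period);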
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index cf6203282737..9f9148075828 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -686,7 +686,7 @@ EXPORT_SYMBOL_GPL(destroy_work_on_stack);
void destroy_delayed_work_on_stack(struct delayed_work *work)
{
- destroy_timer_on_stack(&work->timer);
+ timer_destroy_on_stack(&work->timer);
debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);
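from_timer() is being replaced by its new name, timer_container_of(); both resolve the structure embedding the timer_list that was handed to the callback. A minimal sketch with a hypothetical structure:

    #include <linux/timer.h>

    struct my_work {
    	struct timer_list timer;
    	int pending;
    };

    static void my_timer_fn(struct timer_list *t)
    {
    	struct my_work *w = timer_container_of(w, t, timer);

    	w->pending = 0;	/* the timer fired; clear the flag */
    }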
@@ -2481,7 +2481,7 @@ EXPORT_SYMBOL_GPL(queue_work_node);
void delayed_work_timer_fn(struct timer_list *t)
{
- struct delayed_work *dwork = from_timer(dwork, t, timer);
+ struct delayed_work *dwork = timer_container_of(dwork, t, timer);
/* should have been called from irqsafe timer with irq already off */
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
@@ -2909,7 +2909,7 @@ static void set_worker_dying(struct worker *worker, struct list_head *list)
*/
static void idle_worker_timeout(struct timer_list *t)
{
- struct worker_pool *pool = from_timer(pool, t, idle_timer);
+ struct worker_pool *pool = timer_container_of(pool, t, idle_timer);
bool do_cull = false;
if (work_pending(&pool->idle_cull_work))
@@ -3008,7 +3008,7 @@ static void send_mayday(struct work_struct *work)
static void pool_mayday_timeout(struct timer_list *t)
{
- struct worker_pool *pool = from_timer(pool, t, mayday_timer);
+ struct worker_pool *pool = timer_container_of(pool, t, mayday_timer);
struct work_struct *work;
raw_spin_lock_irq(&pool->lock);
@@ -3241,7 +3241,7 @@ __acquires(&pool->lock)
* point will only record its address.
*/
trace_workqueue_execute_end(work, worker->current_func);
- pwq->stats[PWQ_STAT_COMPLETED]++;
+
lock_map_release(&lockdep_map);
if (!bh_draining)
lock_map_release(pwq->wq->lockdep_map);
@@ -3272,6 +3272,8 @@ __acquires(&pool->lock)
raw_spin_lock_irq(&pool->lock);
+ pwq->stats[PWQ_STAT_COMPLETED]++;
+
/*
* In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
* CPU intensive by wq_worker_tick() if @work hogged CPU longer than
@@ -5837,6 +5839,17 @@ static bool pwq_busy(struct pool_workqueue *pwq)
* @wq: target workqueue
*
* Safely destroy a workqueue. All work currently pending will be done first.
+ *
+ * This function does NOT guarantee that non-pending work that has been
+ * submitted with queue_delayed_work() and similar functions will be done
+ * before destroying the workqueue. The fundamental problem is that, currently,
+ * the workqueue has no way of accessing non-pending delayed_work. delayed_work
+ * is only linked on the timer-side. All delayed_work must, therefore, be
+ * canceled before calling this function.
+ *
+ * TODO: It would be better if the problem described above wouldn't exist and
+ * destroy_workqueue() would cleanly cancel all pending and non-pending
+ * delayed_work.
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
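The added kerneldoc warns that delayed work whose timer has not fired yet is invisible to destroy_workqueue(), so callers must cancel it themselves before tearing the queue down. A sketch of the safe ordering (my_dwork and my_wq are illustrative names):

    	cancel_delayed_work_sync(&my_dwork);	/* kill the timer-side reference first */
    	destroy_workqueue(my_wq);		/* nothing can be re-queued now */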
@@ -7754,7 +7767,8 @@ void __init workqueue_init_early(void)
restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
-
+ cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();