summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore3
-rw-r--r--kernel/Kconfig.freezer1
-rw-r--r--kernel/Kconfig.hz1
-rw-r--r--kernel/Kconfig.locks13
-rw-r--r--kernel/Kconfig.preempt1
-rw-r--r--kernel/Makefile18
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/async.c63
-rw-r--r--kernel/audit.c309
-rw-r--r--kernel/audit.h102
-rw-r--r--kernel/audit_fsnotify.c15
-rw-r--r--kernel/audit_tree.c21
-rw-r--r--kernel/audit_watch.c21
-rw-r--r--kernel/auditfilter.c103
-rw-r--r--kernel/auditsc.c439
-rw-r--r--kernel/backtracetest.c17
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/arraymap.c104
-rw-r--r--kernel/bpf/bpf_lru_list.c5
-rw-r--r--kernel/bpf/bpf_lru_list.h5
-rw-r--r--kernel/bpf/btf.c565
-rw-r--r--kernel/bpf/cgroup.c826
-rw-r--r--kernel/bpf/core.c395
-rw-r--r--kernel/bpf/cpumap.c185
-rw-r--r--kernel/bpf/devmap.c142
-rw-r--r--kernel/bpf/disasm.c49
-rw-r--r--kernel/bpf/disasm.h10
-rw-r--r--kernel/bpf/hashtab.c116
-rw-r--r--kernel/bpf/helpers.c237
-rw-r--r--kernel/bpf/inode.c33
-rw-r--r--kernel/bpf/local_storage.c35
-rw-r--r--kernel/bpf/lpm_trie.c26
-rw-r--r--kernel/bpf/map_in_map.c11
-rw-r--r--kernel/bpf/map_in_map.h5
-rw-r--r--kernel/bpf/offload.c45
-rw-r--r--kernel/bpf/percpu_freelist.c5
-rw-r--r--kernel/bpf/percpu_freelist.h5
-rw-r--r--kernel/bpf/queue_stack_maps.c19
-rw-r--r--kernel/bpf/reuseport_array.c17
-rw-r--r--kernel/bpf/stackmap.c41
-rw-r--r--kernel/bpf/syscall.c423
-rw-r--r--kernel/bpf/tnum.c1
-rw-r--r--kernel/bpf/verifier.c3023
-rw-r--r--kernel/bpf/xskmap.c22
-rw-r--r--kernel/capability.c45
-rw-r--r--kernel/cgroup/Makefile4
-rw-r--r--kernel/cgroup/cgroup-internal.h59
-rw-r--r--kernel/cgroup/cgroup-v1.c445
-rw-r--r--kernel/cgroup/cgroup.c688
-rw-r--r--kernel/cgroup/cpuset.c99
-rw-r--r--kernel/cgroup/debug.c8
-rw-r--r--kernel/cgroup/freezer.c639
-rw-r--r--kernel/cgroup/legacy_freezer.c481
-rw-r--r--kernel/cgroup/pids.c9
-rw-r--r--kernel/cgroup/rdma.c10
-rw-r--r--kernel/cgroup/rstat.c11
-rw-r--r--kernel/compat.c72
-rw-r--r--kernel/configs.c42
-rw-r--r--kernel/context_tracking.c1
-rw-r--r--kernel/cpu.c139
-rw-r--r--kernel/cpu_pm.c11
-rw-r--r--kernel/crash_core.c6
-rw-r--r--kernel/crash_dump.c1
-rw-r--r--kernel/cred.c41
-rw-r--r--kernel/debug/Makefile1
-rw-r--r--kernel/debug/gdbstub.c9
-rw-r--r--kernel/debug/kdb/Makefile1
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/delayacct.c11
-rw-r--r--kernel/dma/Kconfig129
-rw-r--r--kernel/dma/Makefile2
-rw-r--r--kernel/dma/coherent.c50
-rw-r--r--kernel/dma/contiguous.c56
-rw-r--r--kernel/dma/debug.c136
-rw-r--r--kernel/dma/direct.c85
-rw-r--r--kernel/dma/mapping.c46
-rw-r--r--kernel/dma/remap.c16
-rw-r--r--kernel/dma/swiotlb.c81
-rw-r--r--kernel/events/callchain.c3
-rw-r--r--kernel/events/core.c625
-rw-r--r--kernel/events/hw_breakpoint.c15
-rw-r--r--kernel/events/internal.h9
-rw-r--r--kernel/events/ring_buffer.c147
-rw-r--r--kernel/events/uprobes.c42
-rw-r--r--kernel/exit.c8
-rw-r--r--kernel/extable.c14
-rw-r--r--kernel/fail_function.c25
-rw-r--r--kernel/fork.c393
-rw-r--r--kernel/freezer.c1
-rw-r--r--kernel/futex.c306
-rw-r--r--kernel/gcov/Kconfig4
-rw-r--r--kernel/gcov/Makefile5
-rw-r--r--kernel/gcov/base.c86
-rw-r--r--kernel/gcov/clang.c581
-rw-r--r--kernel/gcov/fs.c24
-rw-r--r--kernel/gcov/gcc_3_4.c18
-rw-r--r--kernel/gcov/gcc_4_7.c12
-rw-r--r--kernel/gcov/gcc_base.c86
-rw-r--r--kernel/gcov/gcov.h5
-rwxr-xr-xkernel/gen_kheaders.sh81
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/iomem.c6
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/affinity.c129
-rw-r--r--kernel/irq/autoprobe.c6
-rw-r--r--kernel/irq/chip.c119
-rw-r--r--kernel/irq/cpuhotplug.c2
-rw-r--r--kernel/irq/debugfs.c10
-rw-r--r--kernel/irq/devres.c5
-rw-r--r--kernel/irq/handle.c4
-rw-r--r--kernel/irq/internals.h36
-rw-r--r--kernel/irq/irq_sim.c12
-rw-r--r--kernel/irq/irqdesc.c56
-rw-r--r--kernel/irq/irqdomain.c67
-rw-r--r--kernel/irq/manage.c501
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/irq/timings.c913
-rw-r--r--kernel/irq_work.c76
-rw-r--r--kernel/jump_label.c128
-rw-r--r--kernel/kallsyms.c3
-rw-r--r--kernel/kcov.c15
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kexec_core.c8
-rw-r--r--kernel/kexec_file.c30
-rw-r--r--kernel/kheaders.c66
-rw-r--r--kernel/kprobes.c67
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/kthread.c56
-rw-r--r--kernel/latencytop.c43
-rw-r--r--kernel/livepatch/Kconfig1
-rw-r--r--kernel/livepatch/Makefile1
-rw-r--r--kernel/livepatch/core.c878
-rw-r--r--kernel/livepatch/core.h11
-rw-r--r--kernel/livepatch/patch.c71
-rw-r--r--kernel/livepatch/patch.h5
-rw-r--r--kernel/livepatch/shadow.c14
-rw-r--r--kernel/livepatch/transition.c169
-rw-r--r--kernel/livepatch/transition.h1
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lock_events.c179
-rw-r--r--kernel/locking/lock_events.h60
-rw-r--r--kernel/locking/lock_events_list.h71
-rw-r--r--kernel/locking/lockdep.c2000
-rw-r--r--kernel/locking/lockdep_internals.h77
-rw-r--r--kernel/locking/lockdep_proc.c17
-rw-r--r--kernel/locking/locktorture.c25
-rw-r--r--kernel/locking/mutex.c1
-rw-r--r--kernel/locking/percpu-rwsem.c5
-rw-r--r--kernel/locking/qrwlock.c11
-rw-r--r--kernel/locking/qspinlock.c36
-rw-r--r--kernel/locking/qspinlock_paravirt.h19
-rw-r--r--kernel/locking/qspinlock_stat.h243
-rw-r--r--kernel/locking/rtmutex.c1
-rw-r--r--kernel/locking/rwsem-spinlock.c339
-rw-r--r--kernel/locking/rwsem-xadd.c725
-rw-r--r--kernel/locking/rwsem.c1474
-rw-r--r--kernel/locking/rwsem.h136
-rw-r--r--kernel/locking/semaphore.c3
-rw-r--r--kernel/locking/spinlock.c7
-rw-r--r--kernel/locking/spinlock_debug.c6
-rw-r--r--kernel/locking/test-ww_mutex.c15
-rw-r--r--kernel/memremap.c36
-rw-r--r--kernel/module-internal.h8
-rw-r--r--kernel/module.c133
-rw-r--r--kernel/module_signing.c6
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/nsproxy.c6
-rw-r--r--kernel/padata.c3
-rw-r--r--kernel/panic.c29
-rw-r--r--kernel/params.c14
-rw-r--r--kernel/pid.c73
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/power/Kconfig10
-rw-r--r--kernel/power/energy_model.c59
-rw-r--r--kernel/power/hibernate.c33
-rw-r--r--kernel/power/main.c18
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/poweroff.c3
-rw-r--r--kernel/power/qos.c9
-rw-r--r--kernel/power/snapshot.c29
-rw-r--r--kernel/power/suspend.c33
-rw-r--r--kernel/power/suspend_test.c3
-rw-r--r--kernel/power/swap.c7
-rw-r--r--kernel/power/user.c9
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h14
-rw-r--r--kernel/printk/printk.c123
-rw-r--r--kernel/printk/printk_safe.c14
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c43
-rw-r--r--kernel/rcu/Kconfig31
-rw-r--r--kernel/rcu/Kconfig.debug1
-rw-r--r--kernel/rcu/rcu.h36
-rw-r--r--kernel/rcu/rcu_segcblist.c17
-rw-r--r--kernel/rcu/rcu_segcblist.h17
-rw-r--r--kernel/rcu/rcuperf.c32
-rw-r--r--kernel/rcu/rcutorture.c176
-rw-r--r--kernel/rcu/srcutiny.c26
-rw-r--r--kernel/rcu/srcutree.c161
-rw-r--r--kernel/rcu/sync.c229
-rw-r--r--kernel/rcu/tiny.c21
-rw-r--r--kernel/rcu/tree.c853
-rw-r--r--kernel/rcu/tree.h71
-rw-r--r--kernel/rcu/tree_exp.h270
-rw-r--r--kernel/rcu/tree_plugin.h658
-rw-r--r--kernel/rcu/tree_stall.h711
-rw-r--r--kernel/rcu/update.c87
-rw-r--r--kernel/reboot.c21
-rw-r--r--kernel/relay.c1
-rw-r--r--kernel/resource.c34
-rw-r--r--kernel/rseq.c13
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/clock.c1
-rw-r--r--kernel/sched/core.c781
-rw-r--r--kernel/sched/cpudeadline.c10
-rw-r--r--kernel/sched/cpufreq.c6
-rw-r--r--kernel/sched/cpufreq_schedutil.c98
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/cputime.c1
-rw-r--r--kernel/sched/deadline.c19
-rw-r--r--kernel/sched/debug.c54
-rw-r--r--kernel/sched/fair.c1229
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/isolation.c21
-rw-r--r--kernel/sched/membarrier.c11
-rw-r--r--kernel/sched/pelt.c56
-rw-r--r--kernel/sched/pelt.h114
-rw-r--r--kernel/sched/psi.c619
-rw-r--r--kernel/sched/rt.c19
-rw-r--r--kernel/sched/sched-pelt.h2
-rw-r--r--kernel/sched/sched.h208
-rw-r--r--kernel/sched/topology.c86
-rw-r--r--kernel/sched/wait.c9
-rw-r--r--kernel/sched/wait_bit.c1
-rw-r--r--kernel/seccomp.c33
-rw-r--r--kernel/signal.c473
-rw-r--r--kernel/smp.c13
-rw-r--r--kernel/smpboot.c1
-rw-r--r--kernel/softirq.c59
-rw-r--r--kernel/stacktrace.c336
-rw-r--r--kernel/stop_machine.c24
-rw-r--r--kernel/sys.c75
-rw-r--r--kernel/sys_ni.c32
-rw-r--r--kernel/sysctl.c222
-rw-r--r--kernel/taskstats.c45
-rw-r--r--kernel/test_kprobes.c11
-rw-r--r--kernel/time/Kconfig30
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/alarmtimer.c3
-rw-r--r--kernel/time/clockevents.c18
-rw-r--r--kernel/time/clocksource.c4
-rw-r--r--kernel/time/hrtimer.c12
-rw-r--r--kernel/time/jiffies.c4
-rw-r--r--kernel/time/ntp.c42
-rw-r--r--kernel/time/ntp_internal.h4
-rw-r--r--kernel/time/posix-clock.c2
-rw-r--r--kernel/time/posix-cpu-timers.c13
-rw-r--r--kernel/time/posix-stubs.c25
-rw-r--r--kernel/time/posix-timers.c85
-rw-r--r--kernel/time/posix-timers.h2
-rw-r--r--kernel/time/sched_clock.c10
-rw-r--r--kernel/time/tick-broadcast.c49
-rw-r--r--kernel/time/tick-common.c54
-rw-r--r--kernel/time/tick-internal.h10
-rw-r--r--kernel/time/tick-sched.c51
-rw-r--r--kernel/time/tick-sched.h13
-rw-r--r--kernel/time/time.c108
-rw-r--r--kernel/time/timekeeping.c46
-rw-r--r--kernel/time/timekeeping.h7
-rw-r--r--kernel/time/timekeeping_debug.c11
-rw-r--r--kernel/time/timer.c36
-rw-r--r--kernel/time/timer_list.c36
-rw-r--r--kernel/time/vsyscall.c129
-rw-r--r--kernel/torture.c50
-rw-r--r--kernel/trace/Kconfig9
-rw-r--r--kernel/trace/blktrace.c7
-rw-r--r--kernel/trace/bpf_trace.c243
-rw-r--r--kernel/trace/ftrace.c69
-rw-r--r--kernel/trace/ring_buffer.c23
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c830
-rw-r--r--kernel/trace/trace.h106
-rw-r--r--kernel/trace/trace_branch.c4
-rw-r--r--kernel/trace/trace_dynevent.c2
-rw-r--r--kernel/trace/trace_entries.h41
-rw-r--r--kernel/trace/trace_event_perf.c16
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_events_filter.c104
-rw-r--r--kernel/trace/trace_events_hist.c1289
-rw-r--r--kernel/trace/trace_events_trigger.c3
-rw-r--r--kernel/trace/trace_functions_graph.c30
-rw-r--r--kernel/trace/trace_hwlat.c2
-rw-r--r--kernel/trace/trace_irqsoff.c11
-rw-r--r--kernel/trace/trace_kdb.c73
-rw-r--r--kernel/trace/trace_kprobe.c102
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/trace/trace_preemptirq.c5
-rw-r--r--kernel/trace/trace_probe.c302
-rw-r--r--kernel/trace/trace_probe.h79
-rw-r--r--kernel/trace/trace_probe_tmpl.h2
-rw-r--r--kernel/trace/trace_sched_wakeup.c11
-rw-r--r--kernel/trace/trace_selftest.c5
-rw-r--r--kernel/trace/trace_stack.c85
-rw-r--r--kernel/trace/trace_syscalls.c9
-rw-r--r--kernel/trace/trace_uprobe.c82
-rw-r--r--kernel/tracepoint.c15
-rw-r--r--kernel/tsacct.c13
-rw-r--r--kernel/ucount.c7
-rw-r--r--kernel/umh.c1
-rw-r--r--kernel/up.c4
-rw-r--r--kernel/user-return-notifier.c1
-rw-r--r--kernel/user.c16
-rw-r--r--kernel/user_namespace.c16
-rw-r--r--kernel/utsname.c6
-rw-r--r--kernel/utsname_sysctl.c6
-rw-r--r--kernel/watchdog.c19
-rw-r--r--kernel/watchdog_hld.c3
-rw-r--r--kernel/workqueue.c359
-rw-r--r--kernel/workqueue_internal.h5
323 files changed, 24688 insertions, 11461 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..34d1e77ee9df 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,7 +1,6 @@
#
# Generated files
#
-config_data.h
-config_data.gz
+kheaders.md5
timeconst.h
hz.bc
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
index a3bb4cb52539..68646feefb3d 100644
--- a/kernel/Kconfig.freezer
+++ b/kernel/Kconfig.freezer
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
config FREEZER
def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..38ef6d06888e 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Timer Interrupt Frequency Configuration
#
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 84d882f3e299..e0852dc333ac 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# The ARCH_INLINE foo is necessary because select ignores "depends on"
#
@@ -229,7 +230,7 @@ config MUTEX_SPIN_ON_OWNER
config RWSEM_SPIN_ON_OWNER
def_bool y
- depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+ depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW
config LOCK_SPIN_ON_OWNER
def_bool y
@@ -242,9 +243,19 @@ config QUEUED_SPINLOCKS
def_bool y if ARCH_USE_QUEUED_SPINLOCKS
depends on SMP
+config BPF_ARCH_SPINLOCK
+ bool
+
config ARCH_USE_QUEUED_RWLOCKS
bool
config QUEUED_RWLOCKS
def_bool y if ARCH_USE_QUEUED_RWLOCKS
depends on SMP
+
+config ARCH_HAS_MMIOWB
+ bool
+
+config MMIOWB
+ def_bool y if ARCH_HAS_MMIOWB
+ depends on SMP
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0fee5fe6c899..dc0b682ec2d9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
choice
prompt "Preemption Model"
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aa7543bcdb2..a8d923b5481b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -70,6 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -116,17 +118,17 @@ obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
KCOV_INSTRUMENT_stackleak.o := n
-$(obj)/configs.o: $(obj)/config_data.h
+$(obj)/configs.o: $(obj)/config_data.gz
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
-filechk_ikconfiggz = \
- echo "static const char kernel_config_data[] __used = MAGIC_START"; \
- cat $< | scripts/bin2c; \
- echo "MAGIC_END;"
+$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
-targets += config_data.h
-$(obj)/config_data.h: $(obj)/config_data.gz FORCE
- $(call filechk,ikconfiggz)
+quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
+cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
+$(obj)/kheaders_data.tar.xz: FORCE
+ $(call cmd,genikh)
+
+clean-files := kheaders_data.tar.xz kheaders.md5
diff --git a/kernel/acct.c b/kernel/acct.c
index addf7732fb56..81f9831a7859 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname)
filp_close(file, NULL);
return PTR_ERR(internal);
}
- err = mnt_want_write(internal);
+ err = __mnt_want_write(internal);
if (err) {
mntput(internal);
kfree(acct);
@@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname)
old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
pin_kill(old);
- mnt_drop_write(mnt);
+ __mnt_drop_write(mnt);
mntput(mnt);
return 0;
}
diff --git a/kernel/async.c b/kernel/async.c
index a893d6170944..4f9c1d614016 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* async.c: Asynchronous function calls for boot performance
*
* (C) Copyright 2009 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
@@ -119,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
/* 1) run (and print duration) */
if (initcall_debug && system_state < SYSTEM_RUNNING) {
- pr_debug("calling %lli_%pF @ %i\n",
+ pr_debug("calling %lli_%pS @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
@@ -128,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
if (initcall_debug && system_state < SYSTEM_RUNNING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
+ pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
@@ -149,7 +145,25 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
+/**
+ * async_schedule_node_domain - NUMA specific version of async_schedule_domain
+ * @func: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ * @node: NUMA node that we want to schedule this on or close to
+ * @domain: the domain
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * @domain may be used in the async_synchronize_*_domain() functions to
+ * wait within a certain synchronization domain rather than globally.
+ *
+ * Note: This function may be called from atomic or non-atomic contexts.
+ *
+ * The node requested will be honored on a best effort basis. If the node
+ * has no CPUs associated with it then the work is distributed among all
+ * available CPUs.
+ */
+async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
+ int node, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -195,43 +209,30 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy
current->flags |= PF_USED_ASYNC;
/* schedule for execution */
- queue_work(system_unbound_wq, &entry->work);
+ queue_work_node(node, system_unbound_wq, &entry->work);
return newcookie;
}
+EXPORT_SYMBOL_GPL(async_schedule_node_domain);
/**
- * async_schedule - schedule a function for asynchronous execution
+ * async_schedule_node - NUMA specific version of async_schedule
* @func: function to execute asynchronously
* @data: data pointer to pass to the function
+ * @node: NUMA node that we want to schedule this on or close to
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
- */
-async_cookie_t async_schedule(async_func_t func, void *data)
-{
- return __async_schedule(func, data, &async_dfl_domain);
-}
-EXPORT_SYMBOL_GPL(async_schedule);
-
-/**
- * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @func: function to execute asynchronously
- * @data: data pointer to pass to the function
- * @domain: the domain
*
- * Returns an async_cookie_t that may be used for checkpointing later.
- * @domain may be used in the async_synchronize_*_domain() functions to
- * wait within a certain synchronization domain rather than globally. A
- * synchronization domain is specified via @domain. Note: This function
- * may be called from atomic or non-atomic contexts.
+ * The node requested will be honored on a best effort basis. If the node
+ * has no CPUs associated with it then the work is distributed among all
+ * available CPUs.
*/
-async_cookie_t async_schedule_domain(async_func_t func, void *data,
- struct async_domain *domain)
+async_cookie_t async_schedule_node(async_func_t func, void *data, int node)
{
- return __async_schedule(func, data, domain);
+ return async_schedule_node_domain(func, data, node, &async_dfl_domain);
}
-EXPORT_SYMBOL_GPL(async_schedule_domain);
+EXPORT_SYMBOL_GPL(async_schedule_node);
/**
* async_synchronize_full - synchronize all asynchronous function calls
diff --git a/kernel/audit.c b/kernel/audit.c
index 632d36059556..da8dc0db5bd3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit.c -- Auditing support
* Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
* System-call specific features have moved to auditsc.c
@@ -5,20 +6,6 @@
* Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Goals: 1) Integrate fully with Security Modules.
@@ -396,10 +383,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old,
struct audit_buffer *ab;
int rc = 0;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return rc;
- audit_log_format(ab, "%s=%u old=%u ", function_name, new, old);
+ audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old);
audit_log_session_info(ab);
rc = audit_log_task_context(ab);
if (rc)
@@ -1053,7 +1040,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
+static void audit_log_common_recv_msg(struct audit_context *context,
+ struct audit_buffer **ab, u16 msg_type)
{
uid_t uid = from_kuid(&init_user_ns, current_uid());
pid_t pid = task_tgid_nr(current);
@@ -1063,7 +1051,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
return;
}
- *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ *ab = audit_log_start(context, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
return;
audit_log_format(*ab, "pid=%d uid=%u ", pid, uid);
@@ -1071,6 +1059,12 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
audit_log_task_context(*ab);
}
+static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
+ u16 msg_type)
+{
+ audit_log_common_recv_msg(NULL, ab, msg_type);
+}
+
int is_audit_feature_set(int i)
{
return af.features & AUDIT_FEATURE_TO_MASK(i);
@@ -1338,7 +1332,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
break;
}
- audit_log_common_recv_msg(&ab, msg_type);
+ audit_log_user_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
@@ -1361,8 +1355,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " op=%s audit_enabled=%d res=0",
+ msg_type == AUDIT_ADD_RULE ?
+ "add_rule" : "remove_rule",
+ audit_enabled);
audit_log_end(ab);
return -EPERM;
}
@@ -1373,7 +1371,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
case AUDIT_TRIM:
audit_trim_trees();
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
@@ -1403,8 +1402,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
-
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
audit_log_format(ab, " new=");
@@ -1471,7 +1470,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
old.enabled = t & AUDIT_TTY_ENABLE;
old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_common_recv_msg(audit_context(), &ab,
+ AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
" old-log_passwd=%d new-log_passwd=%d res=%d",
old.enabled, s.enabled, old.log_passwd,
@@ -2054,153 +2054,6 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
-void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
- int i;
-
- if (cap_isclear(*cap)) {
- audit_log_format(ab, " %s=0", prefix);
- return;
- }
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i)
- audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
- audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
- audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
- audit_log_format(ab, " cap_fe=%d cap_fver=%x",
- name->fcap.fE, name->fcap_ver);
-}
-
-static inline int audit_copy_fcaps(struct audit_names *name,
- const struct dentry *dentry)
-{
- struct cpu_vfs_cap_data caps;
- int rc;
-
- if (!dentry)
- return 0;
-
- rc = get_vfs_caps_from_disk(dentry, &caps);
- if (rc)
- return rc;
-
- name->fcap.permitted = caps.permitted;
- name->fcap.inheritable = caps.inheritable;
- name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
- name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
- VFS_CAP_REVISION_SHIFT;
-
- return 0;
-}
-
-/* Copy inode data into an audit_names. */
-void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- struct inode *inode)
-{
- name->ino = inode->i_ino;
- name->dev = inode->i_sb->s_dev;
- name->mode = inode->i_mode;
- name->uid = inode->i_uid;
- name->gid = inode->i_gid;
- name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
- audit_copy_fcaps(name, dentry);
-}
-
-/**
- * audit_log_name - produce AUDIT_PATH record from struct audit_names
- * @context: audit_context for the task
- * @n: audit_names structure with reportable details
- * @path: optional path to report instead of audit_names->name
- * @record_num: record number to report when handling a list of names
- * @call_panic: optional pointer to int that will be updated if secid fails
- */
-void audit_log_name(struct audit_context *context, struct audit_names *n,
- const struct path *path, int record_num, int *call_panic)
-{
- struct audit_buffer *ab;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
- if (!ab)
- return;
-
- audit_log_format(ab, "item=%d", record_num);
-
- if (path)
- audit_log_d_path(ab, " name=", path);
- else if (n->name) {
- switch (n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, " name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name->name,
- n->name_len);
- }
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != AUDIT_INO_UNSET)
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#ho"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- from_kuid(&init_user_ns, n->uid),
- from_kgid(&init_user_ns, n->gid),
- MAJOR(n->rdev),
- MINOR(n->rdev));
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- if (call_panic)
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
-
- /* log the audit_names record type */
- switch(n->type) {
- case AUDIT_TYPE_NORMAL:
- audit_log_format(ab, " nametype=NORMAL");
- break;
- case AUDIT_TYPE_PARENT:
- audit_log_format(ab, " nametype=PARENT");
- break;
- case AUDIT_TYPE_CHILD_DELETE:
- audit_log_format(ab, " nametype=DELETE");
- break;
- case AUDIT_TYPE_CHILD_CREATE:
- audit_log_format(ab, " nametype=CREATE");
- break;
- default:
- audit_log_format(ab, " nametype=UNKNOWN");
- break;
- }
-
- audit_log_fcaps(ab, n);
- audit_log_end(ab);
-}
-
int audit_log_task_context(struct audit_buffer *ab)
{
char *ctx = NULL;
@@ -2322,6 +2175,118 @@ void audit_log_link_denied(const char *operation)
audit_log_end(ab);
}
+/* global counter which is incremented every time something logs in */
+static atomic_t session_id = ATOMIC_INIT(0);
+
+static int audit_set_loginuid_perm(kuid_t loginuid)
+{
+ /* if we are unset, we don't need privs */
+ if (!audit_loginuid_set(current))
+ return 0;
+ /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
+ if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
+ return -EPERM;
+ /* it is set, you need permission */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* reject if this is not an unset and we don't allow that */
+ if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID)
+ && uid_valid(loginuid))
+ return -EPERM;
+ return 0;
+}
+
+static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
+ unsigned int oldsessionid,
+ unsigned int sessionid, int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid, oldloginuid, loginuid;
+ struct tty_struct *tty;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN);
+ if (!ab)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ oldloginuid = from_kuid(&init_user_ns, koldloginuid);
+ loginuid = from_kuid(&init_user_ns, kloginuid),
+ tty = audit_get_tty();
+
+ audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
+ oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
+ oldsessionid, sessionid, !rc);
+ audit_put_tty(tty);
+ audit_log_end(ab);
+}
+
+/**
+ * audit_set_loginuid - set current task's loginuid
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
+int audit_set_loginuid(kuid_t loginuid)
+{
+ unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
+ kuid_t oldloginuid;
+ int rc;
+
+ oldloginuid = audit_get_loginuid(current);
+ oldsessionid = audit_get_sessionid(current);
+
+ rc = audit_set_loginuid_perm(loginuid);
+ if (rc)
+ goto out;
+
+ /* are we setting or clearing? */
+ if (uid_valid(loginuid)) {
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+ if (unlikely(sessionid == AUDIT_SID_UNSET))
+ sessionid = (unsigned int)atomic_inc_return(&session_id);
+ }
+
+ current->sessionid = sessionid;
+ current->loginuid = loginuid;
+out:
+ audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
+ return rc;
+}
+
+/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
+int audit_signal_info(int sig, struct task_struct *t)
+{
+ kuid_t uid = current_uid(), auid;
+
+ if (auditd_test_task(t) &&
+ (sig == SIGTERM || sig == SIGHUP ||
+ sig == SIGUSR1 || sig == SIGUSR2)) {
+ audit_sig_pid = task_tgid_nr(current);
+ auid = audit_get_loginuid(current);
+ if (uid_valid(auid))
+ audit_sig_uid = auid;
+ else
+ audit_sig_uid = uid;
+ security_task_getsecid(current, &audit_sig_sid);
+ }
+
+ return audit_signal_info_syscall(t);
+}
+
/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
diff --git a/kernel/audit.h b/kernel/audit.h
index 91421679a168..6fb7160412d4 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -1,22 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
@@ -69,6 +56,7 @@ struct audit_cap_data {
kernel_cap_t effective; /* effective set of process */
};
kernel_cap_t ambient;
+ kuid_t rootid;
};
/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -212,15 +200,6 @@ extern bool audit_ever_enabled;
extern void audit_log_session_info(struct audit_buffer *ab);
-extern void audit_copy_inode(struct audit_names *name,
- const struct dentry *dentry,
- struct inode *inode);
-extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
- kernel_cap_t *cap);
-extern void audit_log_name(struct audit_context *context,
- struct audit_names *n, const struct path *path,
- int record_num, int *call_panic);
-
extern int auditd_test_task(struct task_struct *task);
#define AUDIT_INODE_BUCKETS 32
@@ -239,7 +218,7 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
extern int parent_len(const char *path);
-extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
+extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen);
extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,
const void *payload, int size);
extern void audit_panic(const char *message);
@@ -267,25 +246,52 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab,
extern struct tty_struct *audit_get_tty(void);
extern void audit_put_tty(struct tty_struct *tty);
-/* audit watch functions */
+/* audit watch/mark/tree functions */
#ifdef CONFIG_AUDITSYSCALL
+extern unsigned int audit_serial(void);
+extern int auditsc_get_stamp(struct audit_context *ctx,
+ struct timespec64 *t, unsigned int *serial);
+
extern void audit_put_watch(struct audit_watch *watch);
extern void audit_get_watch(struct audit_watch *watch);
-extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op);
+extern int audit_to_watch(struct audit_krule *krule, char *path, int len,
+ u32 op);
extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
-extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino,
+ dev_t dev);
-extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
+extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule,
+ char *pathname, int len);
extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
extern void audit_remove_mark_rule(struct audit_krule *krule);
-extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark,
+ unsigned long ino, dev_t dev);
extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
-extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
+extern int audit_exe_compare(struct task_struct *tsk,
+ struct audit_fsnotify_mark *mark);
+
+extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
+extern void audit_put_chunk(struct audit_chunk *chunk);
+extern bool audit_tree_match(struct audit_chunk *chunk,
+ struct audit_tree *tree);
+extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
+extern int audit_add_tree_rule(struct audit_krule *rule);
+extern int audit_remove_tree_rule(struct audit_krule *rule);
+extern void audit_trim_trees(void);
+extern int audit_tag_tree(char *old, char *new);
+extern const char *audit_tree_path(struct audit_tree *tree);
+extern void audit_put_tree(struct audit_tree *tree);
+extern void audit_kill_trees(struct audit_context *context);
-#else
+extern int audit_signal_info_syscall(struct task_struct *t);
+extern void audit_filter_inodes(struct task_struct *tsk,
+ struct audit_context *ctx);
+extern struct list_head *audit_killed_trees(void);
+#else /* CONFIG_AUDITSYSCALL */
+#define auditsc_get_stamp(c, t, s) 0
#define audit_put_watch(w) {}
#define audit_get_watch(w) {}
#define audit_to_watch(k, p, l, o) (-EINVAL)
@@ -301,21 +307,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o) (-EINVAL)
-#endif /* CONFIG_AUDITSYSCALL */
-#ifdef CONFIG_AUDITSYSCALL
-extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
-extern void audit_put_chunk(struct audit_chunk *chunk);
-extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
-extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
-extern int audit_add_tree_rule(struct audit_krule *rule);
-extern int audit_remove_tree_rule(struct audit_krule *rule);
-extern void audit_trim_trees(void);
-extern int audit_tag_tree(char *old, char *new);
-extern const char *audit_tree_path(struct audit_tree *tree);
-extern void audit_put_tree(struct audit_tree *tree);
-extern void audit_kill_trees(struct list_head *list);
-#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
@@ -323,8 +315,15 @@ extern void audit_kill_trees(struct list_head *list);
#define audit_put_tree(tree) (void)0
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
-#define audit_kill_trees(list) BUG()
-#endif
+#define audit_kill_trees(context) BUG()
+
+static inline int audit_signal_info_syscall(struct task_struct *t)
+{
+ return 0;
+}
+
+#define audit_filter_inodes(t, c) AUDIT_DISABLED
+#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
@@ -334,14 +333,5 @@ extern u32 audit_sig_sid;
extern int audit_filter(int msgtype, unsigned int listtype);
-#ifdef CONFIG_AUDITSYSCALL
-extern int audit_signal_info(int sig, struct task_struct *t);
-extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
-extern struct list_head *audit_killed_trees(void);
-#else
-#define audit_signal_info(s,t) AUDIT_DISABLED
-#define audit_filter_inodes(t,c) AUDIT_DISABLED
-#endif
-
extern void audit_ctl_lock(void);
extern void audit_ctl_unlock(void);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index cf4512a33675..f0d243318452 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -1,18 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit_fsnotify.c -- tracking inodes
*
* Copyright 2003-2009,2014-2015 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/kernel.h>
@@ -127,7 +118,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
audit_log_session_info(ab);
@@ -164,7 +155,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
static int audit_mark_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
u32 mask, const void *data, int data_type,
- const unsigned char *dname, u32 cookie,
+ const struct qstr *dname, u32 cookie,
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index d4af4d97f847..e49c912f862d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -524,13 +524,14 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
return 0;
}
-static void audit_tree_log_remove_rule(struct audit_krule *rule)
+static void audit_tree_log_remove_rule(struct audit_context *context,
+ struct audit_krule *rule)
{
struct audit_buffer *ab;
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return;
audit_log_format(ab, "op=remove_rule dir=");
@@ -540,7 +541,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule)
audit_log_end(ab);
}
-static void kill_rules(struct audit_tree *tree)
+static void kill_rules(struct audit_context *context, struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
@@ -551,7 +552,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- audit_tree_log_remove_rule(rule);
+ audit_tree_log_remove_rule(context, rule);
if (entry->rule.exe)
audit_remove_mark(entry->rule.exe);
rule->tree = NULL;
@@ -633,7 +634,7 @@ static void trim_marked(struct audit_tree *tree)
tree->goner = 1;
spin_unlock(&hash_lock);
mutex_lock(&audit_filter_mutex);
- kill_rules(tree);
+ kill_rules(audit_context(), tree);
list_del_init(&tree->list);
mutex_unlock(&audit_filter_mutex);
prune_one(tree);
@@ -973,8 +974,10 @@ static void audit_schedule_prune(void)
* ... and that one is done if evict_chunk() decides to delay until the end
* of syscall. Runs synchronously.
*/
-void audit_kill_trees(struct list_head *list)
+void audit_kill_trees(struct audit_context *context)
{
+ struct list_head *list = &context->killed_trees;
+
audit_ctl_lock();
mutex_lock(&audit_filter_mutex);
@@ -982,7 +985,7 @@ void audit_kill_trees(struct list_head *list)
struct audit_tree *victim;
victim = list_entry(list->next, struct audit_tree, list);
- kill_rules(victim);
+ kill_rules(context, victim);
list_del_init(&victim->list);
mutex_unlock(&audit_filter_mutex);
@@ -1017,7 +1020,7 @@ static void evict_chunk(struct audit_chunk *chunk)
list_del_init(&owner->same_root);
spin_unlock(&hash_lock);
if (!postponed) {
- kill_rules(owner);
+ kill_rules(audit_context(), owner);
list_move(&owner->list, &prune_list);
need_prune = 1;
} else {
@@ -1037,7 +1040,7 @@ static void evict_chunk(struct audit_chunk *chunk)
static int audit_tree_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie,
+ const struct qstr *file_name, u32 cookie,
struct fsnotify_iter_info *iter_info)
{
return 0;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 20ef9ba134b0..1f31c2f1e6fc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit_watch.c -- watching inodes
*
* Copyright 2003-2009 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/file.h>
@@ -242,7 +229,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
audit_log_session_info(ab);
@@ -255,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
/* Update inode info in audit rules based on filesystem event. */
static void audit_update_watch(struct audit_parent *parent,
- const char *dname, dev_t dev,
+ const struct qstr *dname, dev_t dev,
unsigned long ino, unsigned invalidating)
{
struct audit_watch *owatch, *nwatch, *nextw;
@@ -482,7 +469,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
static int audit_watch_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
u32 mask, const void *data, int data_type,
- const unsigned char *dname, u32 cookie,
+ const struct qstr *dname, u32 cookie,
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index bf309f2592c4..b0126e9c0743 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* auditfilter.c -- filtering of audit events
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -335,7 +322,7 @@ static u32 audit_to_op(u32 op)
/* check if an audit field is valid */
static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
- switch(f->type) {
+ switch (f->type) {
case AUDIT_MSGTYPE:
if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
entry->rule.listnr != AUDIT_FILTER_USER)
@@ -347,7 +334,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
break;
}
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_FS:
switch(f->type) {
case AUDIT_FSTYPE:
@@ -358,9 +345,16 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
}
}
- switch(f->type) {
- default:
- return -EINVAL;
+ /* Check for valid field type and op */
+ switch (f->type) {
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_PERS: /* <uapi/linux/personality.h> */
+ case AUDIT_DEVMINOR:
+ /* all ops are valid */
+ break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
@@ -373,46 +367,53 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
case AUDIT_PID:
- case AUDIT_PERS:
case AUDIT_MSGTYPE:
case AUDIT_PPID:
case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
case AUDIT_EXIT:
case AUDIT_SUCCESS:
case AUDIT_INODE:
case AUDIT_SESSIONID:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_SADDR_FAM:
/* bit ops are only useful on syscall args */
if (f->op == Audit_bitmask || f->op == Audit_bittest)
return -EINVAL;
break;
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
- case AUDIT_SUBJ_SEN:
- case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
- case AUDIT_OBJ_LEV_LOW:
- case AUDIT_OBJ_LEV_HIGH:
case AUDIT_WATCH:
case AUDIT_DIR:
case AUDIT_FILTERKEY:
- break;
case AUDIT_LOGINUID_SET:
- if ((f->val != 0) && (f->val != 1))
- return -EINVAL;
- /* FALL THROUGH */
case AUDIT_ARCH:
case AUDIT_FSTYPE:
+ case AUDIT_PERM:
+ case AUDIT_FILETYPE:
+ case AUDIT_FIELD_COMPARE:
+ case AUDIT_EXE:
+ /* only equal and not equal valid ops */
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
break;
+ default:
+ /* field not recognized */
+ return -EINVAL;
+ }
+
+ /* Check for select valid field values */
+ switch (f->type) {
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ break;
case AUDIT_PERM:
if (f->val & ~15)
return -EINVAL;
@@ -425,11 +426,14 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (f->val > AUDIT_MAX_FIELD_COMPARE)
return -EINVAL;
break;
- case AUDIT_EXE:
- if (f->op != Audit_not_equal && f->op != Audit_equal)
+ case AUDIT_SADDR_FAM:
+ if (f->val >= AF_MAX)
return -EINVAL;
break;
+ default:
+ break;
}
+
return 0;
}
@@ -670,7 +674,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->values[i] = AUDIT_UID_UNSET;
break;
}
- /* fallthrough if set */
+ /* fall through - if set */
default:
data->values[i] = f->val;
}
@@ -1091,7 +1095,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
audit_log_session_info(ab);
@@ -1114,22 +1118,24 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz)
int err = 0;
struct audit_entry *entry;
- entry = audit_data_to_entry(data, datasz);
- if (IS_ERR(entry))
- return PTR_ERR(entry);
-
switch (type) {
case AUDIT_ADD_RULE:
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
err = audit_add_rule(entry);
audit_log_rule_change("add_rule", &entry->rule, !err);
break;
case AUDIT_DEL_RULE:
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
err = audit_del_rule(entry);
audit_log_rule_change("remove_rule", &entry->rule, !err);
break;
default:
- err = -EINVAL;
WARN_ON(1);
+ return -EINVAL;
}
if (err || type == AUDIT_DEL_RULE) {
@@ -1201,7 +1207,6 @@ int audit_comparator(u32 left, u32 op, u32 right)
case Audit_bittest:
return ((left & right) == right);
default:
- BUG();
return 0;
}
}
@@ -1224,7 +1229,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
case Audit_bitmask:
case Audit_bittest:
default:
- BUG();
return 0;
}
}
@@ -1247,7 +1251,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
case Audit_bitmask:
case Audit_bittest:
default:
- BUG();
return 0;
}
}
@@ -1290,12 +1293,12 @@ int parent_len(const char *path)
* @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
* here indicates that we must compute this value.
*/
-int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen)
{
int dlen, pathlen;
const char *p;
- dlen = strlen(dname);
+ dlen = dname->len;
pathlen = strlen(path);
if (pathlen < dlen)
return 1;
@@ -1306,7 +1309,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
p = path + parentlen;
- return strncmp(p, dname, dlen);
+ return strncmp(p, dname->name, dlen);
}
int audit_filter(int msgtype, unsigned int listtype)
@@ -1315,8 +1318,6 @@ int audit_filter(int msgtype, unsigned int listtype)
int ret = 1; /* Audit by default */
rcu_read_lock();
- if (list_empty(&audit_filter_list[listtype]))
- goto unlock_and_return;
list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) {
int i, result = 0;
@@ -1355,7 +1356,7 @@ int audit_filter(int msgtype, unsigned int listtype)
if (f->lsm_rule) {
security_task_getsecid(current, &sid);
result = security_audit_rule_match(sid,
- f->type, f->op, f->lsm_rule, NULL);
+ f->type, f->op, f->lsm_rule);
}
break;
case AUDIT_EXE:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 6593a5207fb0..4effe01ebbe2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -601,12 +601,20 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name)
- result = audit_watch_compare(rule->watch, name->ino, name->dev);
+ if (name) {
+ result = audit_watch_compare(rule->watch,
+ name->ino,
+ name->dev);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ }
break;
case AUDIT_DIR:
- if (ctx)
+ if (ctx) {
result = match_tree_refs(ctx, rule->tree);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ }
break;
case AUDIT_LOGINUID:
result = audit_uid_comparator(audit_get_loginuid(tsk),
@@ -615,6 +623,11 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
+ case AUDIT_SADDR_FAM:
+ if (ctx->sockaddr)
+ result = audit_comparator(ctx->sockaddr->ss_family,
+ f->op, f->val);
+ break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -631,9 +644,8 @@ static int audit_filter_rules(struct task_struct *tsk,
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
- f->op,
- f->lsm_rule,
- ctx);
+ f->op,
+ f->lsm_rule);
}
break;
case AUDIT_OBJ_USER:
@@ -647,13 +659,17 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find files that match */
if (name) {
result = security_audit_rule_match(
- name->osid, f->type, f->op,
- f->lsm_rule, ctx);
+ name->osid,
+ f->type,
+ f->op,
+ f->lsm_rule);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
- if (security_audit_rule_match(n->osid, f->type,
- f->op, f->lsm_rule,
- ctx)) {
+ if (security_audit_rule_match(
+ n->osid,
+ f->type,
+ f->op,
+ f->lsm_rule)) {
++result;
break;
}
@@ -664,7 +680,7 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
if (security_audit_rule_match(ctx->ipc.osid,
f->type, f->op,
- f->lsm_rule, ctx))
+ f->lsm_rule))
++result;
}
break;
@@ -681,9 +697,13 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_PERM:
result = audit_match_perm(ctx, f->val);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_FILETYPE:
result = audit_match_filetype(ctx, f->val);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_FIELD_COMPARE:
result = audit_field_compare(tsk, cred, f, ctx, name);
@@ -768,15 +788,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
return AUDIT_DISABLED;
rcu_read_lock();
- if (!list_empty(list)) {
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return state;
- }
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, ctx->major) &&
+ audit_filter_rules(tsk, &e->rule, ctx, NULL,
+ &state, false)) {
+ rcu_read_unlock();
+ ctx->current_state = state;
+ return state;
}
}
rcu_read_unlock();
@@ -795,9 +813,6 @@ static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_entry *e;
enum audit_state state;
- if (list_empty(list))
- return 0;
-
list_for_each_entry_rcu(e, list, list) {
if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
@@ -805,7 +820,6 @@ static int audit_filter_inode_name(struct task_struct *tsk,
return 1;
}
}
-
return 0;
}
@@ -837,6 +851,13 @@ static inline void audit_proctitle_free(struct audit_context *context)
context->proctitle.len = 0;
}
+static inline void audit_free_module(struct audit_context *context)
+{
+ if (context->type == AUDIT_KERN_MODULE) {
+ kfree(context->module.name);
+ context->module.name = NULL;
+ }
+}
static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
@@ -920,6 +941,7 @@ int audit_alloc(struct task_struct *tsk)
static inline void audit_free_context(struct audit_context *context)
{
+ audit_free_module(context);
audit_free_names(context);
unroll_tree_refs(context, NULL, 0);
free_tree_refs(context);
@@ -1136,6 +1158,33 @@ out:
kfree(buf_head);
}
+static void audit_log_cap(struct audit_buffer *ab, char *prefix,
+ kernel_cap_t *cap)
+{
+ int i;
+
+ if (cap_isclear(*cap)) {
+ audit_log_format(ab, " %s=0", prefix);
+ return;
+ }
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i)
+ audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]);
+}
+
+static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ if (name->fcap_ver == -1) {
+ audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? cap_fi=?");
+ return;
+ }
+ audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
+ audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d",
+ name->fcap.fE, name->fcap_ver,
+ from_kuid(&init_user_ns, name->fcap.rootid));
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1237,7 +1286,6 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "name=");
if (context->module.name) {
audit_log_untrustedstring(ab, context->module.name);
- kfree(context->module.name);
} else
audit_log_format(ab, "(null)");
@@ -1258,6 +1306,97 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len)
return len;
}
+/*
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+ const struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd
+ */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != AUDIT_INO_UNSET)
+ audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ if (call_panic)
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ /* log the audit_names record type */
+ switch (n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, " nametype=NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, " nametype=PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, " nametype=DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, " nametype=CREATE");
+ break;
+ default:
+ audit_log_format(ab, " nametype=UNKNOWN");
+ break;
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
static void audit_log_proctitle(void)
{
int res;
@@ -1358,6 +1497,9 @@ static void audit_log_exit(void)
audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
audit_log_cap(ab, "pe", &axs->new_pcap.effective);
audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
+ audit_log_format(ab, " frootid=%d",
+ from_kuid(&init_user_ns,
+ axs->fcap.rootid));
break; }
}
@@ -1444,6 +1586,9 @@ void __audit_free(struct task_struct *tsk)
if (!context)
return;
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(context);
+
/* We are called either by do_exit() or the fork() error handling code;
* in the former case tsk == current and in the latter tsk is a
* random task_struct that doesn't doesn't have any meaningful data we
@@ -1460,9 +1605,6 @@ void __audit_free(struct task_struct *tsk)
audit_log_exit();
}
- if (!list_empty(&context->killed_trees))
- audit_kill_trees(&context->killed_trees);
-
audit_set_context(tsk, NULL);
audit_free_context(context);
}
@@ -1505,7 +1647,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
return;
}
- context->arch = syscall_get_arch();
+ context->arch = syscall_get_arch(current);
context->major = major;
context->argv[0] = a1;
context->argv[1] = a2;
@@ -1537,6 +1679,9 @@ void __audit_syscall_exit(int success, long return_code)
if (!context)
return;
+ if (!list_empty(&context->killed_trees))
+ audit_kill_trees(context);
+
if (!context->dummy && context->in_syscall) {
if (success)
context->return_valid = AUDITSC_SUCCESS;
@@ -1571,9 +1716,7 @@ void __audit_syscall_exit(int success, long return_code)
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
- if (!list_empty(&context->killed_trees))
- audit_kill_trees(&context->killed_trees);
-
+ audit_free_module(context);
audit_free_names(context);
unroll_tree_refs(context, NULL, 0);
audit_free_aux(context);
@@ -1750,6 +1893,48 @@ void __audit_getname(struct filename *name)
get_fs_pwd(current->fs, &context->pwd);
}
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap.rootid = caps.rootid;
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+static void audit_copy_inode(struct audit_names *name,
+ const struct dentry *dentry,
+ struct inode *inode, unsigned int flags)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+ if (flags & AUDIT_INODE_NOEVAL) {
+ name->fcap_ver = -1;
+ return;
+ }
+ audit_copy_fcaps(name, dentry);
+}
+
/**
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -1763,10 +1948,29 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
struct inode *inode = d_backing_inode(dentry);
struct audit_names *n;
bool parent = flags & AUDIT_INODE_PARENT;
+ struct audit_entry *e;
+ struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+ int i;
if (!context->in_syscall)
return;
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(inode->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ }
+ rcu_read_unlock();
+
if (!name)
goto out_alloc;
@@ -1832,7 +2036,7 @@ out:
n->type = AUDIT_TYPE_NORMAL;
}
handle_path(dentry);
- audit_copy_inode(n, dentry, inode);
+ audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL);
}
void __audit_file(const struct file *file)
@@ -1860,7 +2064,7 @@ void __audit_inode_child(struct inode *parent,
{
struct audit_context *context = audit_context();
struct inode *inode = d_backing_inode(dentry);
- const char *dname = dentry->d_name.name;
+ const struct qstr *dname = &dentry->d_name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
struct audit_entry *e;
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
@@ -1870,20 +2074,16 @@ void __audit_inode_child(struct inode *parent,
return;
rcu_read_lock();
- if (!list_empty(list)) {
- list_for_each_entry_rcu(e, list, list) {
- for (i = 0; i < e->rule.field_count; i++) {
- struct audit_field *f = &e->rule.fields[i];
-
- if (f->type == AUDIT_FSTYPE) {
- if (audit_comparator(parent->i_sb->s_magic,
- f->op, f->val)) {
- if (e->rule.action == AUDIT_NEVER) {
- rcu_read_unlock();
- return;
- }
- }
- }
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(parent->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
}
}
}
@@ -1916,7 +2116,7 @@ void __audit_inode_child(struct inode *parent,
(n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
continue;
- if (!strcmp(dname, n->name->name) ||
+ if (!strcmp(dname->name, n->name->name) ||
!audit_compare_dname_path(dname, n->name->name,
found_parent ?
found_parent->name_len :
@@ -1933,7 +2133,7 @@ void __audit_inode_child(struct inode *parent,
n = audit_alloc_name(context, AUDIT_TYPE_PARENT);
if (!n)
return;
- audit_copy_inode(n, NULL, parent);
+ audit_copy_inode(n, NULL, parent, 0);
}
if (!found_child) {
@@ -1952,7 +2152,7 @@ void __audit_inode_child(struct inode *parent,
}
if (inode)
- audit_copy_inode(found_child, dentry, inode);
+ audit_copy_inode(found_child, dentry, inode, 0);
else
found_child->ino = AUDIT_INO_UNSET;
}
@@ -1983,90 +2183,6 @@ int auditsc_get_stamp(struct audit_context *ctx,
return 1;
}
-/* global counter which is incremented every time something logs in */
-static atomic_t session_id = ATOMIC_INIT(0);
-
-static int audit_set_loginuid_perm(kuid_t loginuid)
-{
- /* if we are unset, we don't need privs */
- if (!audit_loginuid_set(current))
- return 0;
- /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/
- if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
- return -EPERM;
- /* it is set, you need permission */
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
- /* reject if this is not an unset and we don't allow that */
- if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
- return -EPERM;
- return 0;
-}
-
-static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
- unsigned int oldsessionid, unsigned int sessionid,
- int rc)
-{
- struct audit_buffer *ab;
- uid_t uid, oldloginuid, loginuid;
- struct tty_struct *tty;
-
- if (!audit_enabled)
- return;
-
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (!ab)
- return;
-
- uid = from_kuid(&init_user_ns, task_uid(current));
- oldloginuid = from_kuid(&init_user_ns, koldloginuid);
- loginuid = from_kuid(&init_user_ns, kloginuid),
- tty = audit_get_tty();
-
- audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
- audit_log_task_context(ab);
- audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
- oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
- oldsessionid, sessionid, !rc);
- audit_put_tty(tty);
- audit_log_end(ab);
-}
-
-/**
- * audit_set_loginuid - set current task's audit_context loginuid
- * @loginuid: loginuid value
- *
- * Returns 0.
- *
- * Called (set) from fs/proc/base.c::proc_loginuid_write().
- */
-int audit_set_loginuid(kuid_t loginuid)
-{
- unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET;
- kuid_t oldloginuid;
- int rc;
-
- oldloginuid = audit_get_loginuid(current);
- oldsessionid = audit_get_sessionid(current);
-
- rc = audit_set_loginuid_perm(loginuid);
- if (rc)
- goto out;
-
- /* are we setting or clearing? */
- if (uid_valid(loginuid)) {
- sessionid = (unsigned int)atomic_inc_return(&session_id);
- if (unlikely(sessionid == AUDIT_SID_UNSET))
- sessionid = (unsigned int)atomic_inc_return(&session_id);
- }
-
- current->sessionid = sessionid;
- current->loginuid = loginuid;
-out:
- audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc);
- return rc;
-}
-
/**
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
@@ -2261,30 +2377,17 @@ void __audit_ptrace(struct task_struct *t)
}
/**
- * audit_signal_info - record signal info for shutting down audit subsystem
- * @sig: signal value
+ * audit_signal_info_syscall - record signal info for syscalls
* @t: task being signaled
*
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
-int audit_signal_info(int sig, struct task_struct *t)
+int audit_signal_info_syscall(struct task_struct *t)
{
struct audit_aux_data_pids *axp;
struct audit_context *ctx = audit_context();
- kuid_t uid = current_uid(), auid, t_uid = task_uid(t);
-
- if (auditd_test_task(t) &&
- (sig == SIGTERM || sig == SIGHUP ||
- sig == SIGUSR1 || sig == SIGUSR2)) {
- audit_sig_pid = task_tgid_nr(current);
- auid = audit_get_loginuid(current);
- if (uid_valid(auid))
- audit_sig_uid = auid;
- else
- audit_sig_uid = uid;
- security_task_getsecid(current, &audit_sig_sid);
- }
+ kuid_t t_uid = task_uid(t);
if (!audit_signals || audit_dummy_context())
return 0;
@@ -2355,6 +2458,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ ax->fcap.rootid = vcaps.rootid;
ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
ax->old_pcap.permitted = old->cap_permitted;
@@ -2412,6 +2516,35 @@ void __audit_fanotify(unsigned int response)
AUDIT_FANOTIFY, "resp=%u", response);
}
+void __audit_tk_injoffset(struct timespec64 offset)
+{
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
+ "sec=%lli nsec=%li",
+ (long long)offset.tv_sec, offset.tv_nsec);
+}
+
+static void audit_log_ntp_val(const struct audit_ntp_data *ad,
+ const char *op, enum audit_ntp_type type)
+{
+ const struct audit_ntp_val *val = &ad->vals[type];
+
+ if (val->newval == val->oldval)
+ return;
+
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
+ "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+}
+
+void __audit_ntp_log(const struct audit_ntp_data *ad)
+{
+ audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
+ audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
+ audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
+ audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
+ audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
+ audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
+}
+
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
@@ -2480,7 +2613,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code)
return;
audit_log_task(ab);
audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
- signr, syscall_get_arch(), syscall,
+ signr, syscall_get_arch(current), syscall,
in_compat_syscall(), KSTK_EIP(current), code);
audit_log_end(ab);
}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index 1323360d90e3..a2a97fa3071b 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple stack backtrace regression test module
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include <linux/completion.h>
@@ -48,19 +44,14 @@ static void backtrace_test_irq(void)
#ifdef CONFIG_STACKTRACE
static void backtrace_test_saved(void)
{
- struct stack_trace trace;
unsigned long entries[8];
+ unsigned int nr_entries;
pr_info("Testing a saved backtrace.\n");
pr_info("The following trace is a kernel self test and not a bug!\n");
- trace.nr_entries = 0;
- trace.max_entries = ARRAY_SIZE(entries);
- trace.entries = entries;
- trace.skip = 0;
-
- save_stack_trace(&trace);
- print_stack_trace(&trace, 0);
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ stack_trace_print(entries, nr_entries, 0);
}
#else
static void backtrace_test_saved(void)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4c2fa3ac56f6..29d781061cd5 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-y := core.o
+CFLAGS_core.o += $(call cc-disable-warning, override-init)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630..1c65ce0098a9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016,2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/btf.h>
@@ -22,7 +14,7 @@
#include "map_in_map.h"
#define ARRAY_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
static void bpf_array_free_percpu(struct bpf_array *array)
{
@@ -63,6 +55,7 @@ int array_map_alloc_check(union bpf_attr *attr)
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size == 0 ||
attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
(percpu && numa_node != NUMA_NO_NODE))
return -EINVAL;
@@ -82,6 +75,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
u32 elem_size, index_mask, max_entries;
bool unpriv = !capable(CAP_SYS_ADMIN);
u64 cost, array_size, mask64;
+ struct bpf_map_memory mem;
struct bpf_array *array;
elem_size = round_up(attr->value_size, 8);
@@ -115,32 +109,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
/* make sure there is no u32 overflow later in round_up() */
cost = array_size;
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-ENOMEM);
- if (percpu) {
+ if (percpu)
cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-ENOMEM);
- }
- cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- ret = bpf_map_precharge_memlock(cost);
+ ret = bpf_map_charge_init(&mem, cost);
if (ret < 0)
return ERR_PTR(ret);
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
- if (!array)
+ if (!array) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
+ }
array->index_mask = index_mask;
array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
- array->map.pages = cost;
+ bpf_map_charge_move(&array->map.memory, &mem);
array->elem_size = elem_size;
if (percpu && bpf_array_alloc_percpu(array)) {
+ bpf_map_charge_finish(&array->map.memory);
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
@@ -160,6 +151,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + array->elem_size * (index & array->index_mask);
}
+static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
+ u32 off)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ if (map->max_entries != 1)
+ return -ENOTSUPP;
+ if (off >= map->value_size)
+ return -EINVAL;
+
+ *imm = (unsigned long)array->value;
+ return 0;
+}
+
+static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
+ u32 *off)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u64 base = (unsigned long)array->value;
+ u64 range = array->elem_size;
+
+ if (map->max_entries != 1)
+ return -ENOTSUPP;
+ if (imm < base || imm >= base + range)
+ return -ENOENT;
+
+ *off = imm - base;
+ return 0;
+}
+
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
@@ -253,8 +274,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
+ char *val;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
@@ -262,17 +284,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
- if (unlikely(map_flags == BPF_NOEXIST))
+ if (unlikely(map_flags & BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
- if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ if (unlikely((map_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)))
+ return -EINVAL;
+
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
- else
- memcpy(array->value +
- array->elem_size * (index & array->index_mask),
- value, map->value_size);
+ } else {
+ val = array->value +
+ array->elem_size * (index & array->index_mask);
+ if (map_flags & BPF_F_LOCK)
+ copy_map_value_locked(map, val, value, false);
+ else
+ copy_map_value(map, val, value);
+ }
return 0;
}
@@ -351,7 +381,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
return;
}
- seq_printf(m, "%u: ", *(u32 *)key);
+ if (map->btf_key_type_id)
+ seq_printf(m, "%u: ", *(u32 *)key);
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
seq_puts(m, "\n");
@@ -388,6 +419,18 @@ static int array_map_check_btf(const struct bpf_map *map,
{
u32 int_data;
+ /* One exception for keyless BTF: .bss/.data/.rodata map */
+ if (btf_type_is_void(key_type)) {
+ if (map->map_type != BPF_MAP_TYPE_ARRAY ||
+ map->max_entries != 1)
+ return -EINVAL;
+
+ if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
+ return -EINVAL;
+
+ return 0;
+ }
+
if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
return -EINVAL;
@@ -410,6 +453,8 @@ const struct bpf_map_ops array_map_ops = {
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_gen_lookup = array_map_gen_lookup,
+ .map_direct_value_addr = array_map_direct_value_addr,
+ .map_direct_value_meta = array_map_direct_value_meta,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
};
@@ -431,6 +476,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr)
/* only file descriptors can be stored in this type of map */
if (attr->value_size != sizeof(u32))
return -EINVAL;
+ /* Program read-only/write-only not supported for special maps yet. */
+ if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
+ return -EINVAL;
return array_map_alloc_check(attr);
}
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a138..1b6b9349cb85 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/cpumask.h>
#include <linux/spinlock.h>
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..f02504640e18 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -1,8 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#ifndef __BPF_LRU_LIST_H_
#define __BPF_LRU_LIST_H_
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c57bd10340ed..546ebee39e2a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -157,7 +157,7 @@
*
*/
-#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2)
#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
@@ -185,6 +185,16 @@
i < btf_type_vlen(struct_type); \
i++, member++)
+#define for_each_vsi(i, struct_type, member) \
+ for (i = 0, member = btf_type_var_secinfo(struct_type); \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
+#define for_each_vsi_from(i, from, struct_type, member) \
+ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
static DEFINE_IDR(btf_idr);
static DEFINE_SPINLOCK(btf_idr_lock);
@@ -262,6 +272,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_RESTRICT] = "RESTRICT",
[BTF_KIND_FUNC] = "FUNC",
[BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
+ [BTF_KIND_VAR] = "VAR",
+ [BTF_KIND_DATASEC] = "DATASEC",
};
struct btf_kind_operations {
@@ -314,7 +326,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
return false;
}
-static bool btf_type_is_void(const struct btf_type *t)
+bool btf_type_is_void(const struct btf_type *t)
{
return t == &btf_void;
}
@@ -355,6 +367,11 @@ static bool btf_type_is_struct(const struct btf_type *t)
return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
}
+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
static bool btf_type_is_array(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -370,13 +387,36 @@ static bool btf_type_is_int(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
}
+static bool btf_type_is_var(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
+}
+
+static bool btf_type_is_datasec(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+}
+
+/* Types that act only as a source, not sink or intermediate
+ * type when resolving.
+ */
+static bool btf_type_is_resolve_source_only(const struct btf_type *t)
+{
+ return btf_type_is_var(t) ||
+ btf_type_is_datasec(t);
+}
+
/* What types need to be resolved?
*
* btf_type_is_modifier() is an obvious one.
*
* btf_type_is_struct() because its member refers to
* another type (through member->type).
-
+ *
+ * btf_type_is_var() because the variable refers to
+ * another type. btf_type_is_datasec() holds multiple
+ * btf_type_is_var() types that need resolving.
+ *
* btf_type_is_array() because its element (array->type)
* refers to another type. Array can be thought of a
* special case of struct while array just has the same
@@ -385,9 +425,11 @@ static bool btf_type_is_int(const struct btf_type *t)
static bool btf_type_needs_resolve(const struct btf_type *t)
{
return btf_type_is_modifier(t) ||
- btf_type_is_ptr(t) ||
- btf_type_is_struct(t) ||
- btf_type_is_array(t);
+ btf_type_is_ptr(t) ||
+ btf_type_is_struct(t) ||
+ btf_type_is_array(t) ||
+ btf_type_is_var(t) ||
+ btf_type_is_datasec(t);
}
/* t->size can be used */
@@ -398,6 +440,7 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
+ case BTF_KIND_DATASEC:
return true;
}
@@ -462,6 +505,16 @@ static const struct btf_enum *btf_type_enum(const struct btf_type *t)
return (const struct btf_enum *)(t + 1);
}
+static const struct btf_var *btf_type_var(const struct btf_type *t)
+{
+ return (const struct btf_var *)(t + 1);
+}
+
+static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t)
+{
+ return (const struct btf_var_secinfo *)(t + 1);
+}
+
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
@@ -473,23 +526,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
offset < btf->hdr.str_len;
}
-/* Only C-style identifier is permitted. This can be relaxed if
- * necessary.
- */
-static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
+{
+ if ((first ? !isalpha(c) :
+ !isalnum(c)) &&
+ c != '_' &&
+ ((c == '.' && !dot_ok) ||
+ c != '.'))
+ return false;
+ return true;
+}
+
+static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
{
/* offset must be valid */
const char *src = &btf->strings[offset];
const char *src_limit;
- if (!isalpha(*src) && *src != '_')
+ if (!__btf_name_char_ok(*src, true, dot_ok))
return false;
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
src++;
while (*src && src < src_limit) {
- if (!isalnum(*src) && *src != '_')
+ if (!__btf_name_char_ok(*src, false, dot_ok))
return false;
src++;
}
@@ -497,6 +558,19 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
return !*src;
}
+/* Only C-style identifier is permitted. This can be relaxed if
+ * necessary.
+ */
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+{
+ return __btf_name_valid(btf, offset, false);
+}
+
+static bool btf_name_valid_section(const struct btf *btf, u32 offset)
+{
+ return __btf_name_valid(btf, offset, true);
+}
+
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
{
if (!offset)
@@ -525,7 +599,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
/*
* Regular int is not a bit field and it must be either
- * u8/u16/u32/u64.
+ * u8/u16/u32/u64 or __int128.
*/
static bool btf_type_int_is_regular(const struct btf_type *t)
{
@@ -538,7 +612,8 @@ static bool btf_type_int_is_regular(const struct btf_type *t)
if (BITS_PER_BYTE_MASKED(nr_bits) ||
BTF_INT_OFFSET(int_data) ||
(nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
- nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+ nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) &&
+ nr_bytes != (2 * sizeof(u64)))) {
return false;
}
@@ -691,6 +766,32 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
__btf_verifier_log(log, "\n");
}
+__printf(4, 5)
+static void btf_verifier_log_vsi(struct btf_verifier_env *env,
+ const struct btf_type *datasec_type,
+ const struct btf_var_secinfo *vsi,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+ if (env->phase != CHECK_META)
+ btf_verifier_log_type(env, datasec_type, NULL);
+
+ __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u",
+ vsi->type, vsi->offset, vsi->size);
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
static void btf_verifier_log_hdr(struct btf_verifier_env *env,
u32 btf_data_size)
{
@@ -968,7 +1069,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
} else if (btf_type_is_ptr(size_type)) {
size = sizeof(void *);
} else {
- if (WARN_ON_ONCE(!btf_type_is_modifier(size_type)))
+ if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) &&
+ !btf_type_is_var(size_type)))
return NULL;
size = btf->resolved_sizes[size_type_id];
@@ -1063,9 +1165,9 @@ static int btf_int_check_member(struct btf_verifier_env *env,
nr_copy_bits = BTF_INT_BITS(int_data) +
BITS_PER_BYTE_MASKED(struct_bits_off);
- if (nr_copy_bits > BITS_PER_U64) {
+ if (nr_copy_bits > BITS_PER_U128) {
btf_verifier_log_member(env, struct_type, member,
- "nr_copy_bits exceeds 64");
+ "nr_copy_bits exceeds 128");
return -EINVAL;
}
@@ -1119,9 +1221,9 @@ static int btf_int_check_kflag_member(struct btf_verifier_env *env,
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off);
- if (nr_copy_bits > BITS_PER_U64) {
+ if (nr_copy_bits > BITS_PER_U128) {
btf_verifier_log_member(env, struct_type, member,
- "nr_copy_bits exceeds 64");
+ "nr_copy_bits exceeds 128");
return -EINVAL;
}
@@ -1168,9 +1270,9 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env,
nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
- if (nr_bits > BITS_PER_U64) {
+ if (nr_bits > BITS_PER_U128) {
btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
- BITS_PER_U64);
+ BITS_PER_U128);
return -EINVAL;
}
@@ -1211,31 +1313,93 @@ static void btf_int_log(struct btf_verifier_env *env,
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
+static void btf_int128_print(struct seq_file *m, void *data)
+{
+ /* data points to a __int128 number.
+ * Suppose
+ * int128_num = *(__int128 *)data;
+ * The below formulas shows what upper_num and lower_num represents:
+ * upper_num = int128_num >> 64;
+ * lower_num = int128_num & 0xffffffffFFFFFFFFULL;
+ */
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = *(u64 *)data;
+ lower_num = *(u64 *)(data + 8);
+#else
+ upper_num = *(u64 *)(data + 8);
+ lower_num = *(u64 *)data;
+#endif
+ if (upper_num == 0)
+ seq_printf(m, "0x%llx", lower_num);
+ else
+ seq_printf(m, "0x%llx%016llx", upper_num, lower_num);
+}
+
+static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
+ u16 right_shift_bits)
+{
+ u64 upper_num, lower_num;
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ upper_num = print_num[0];
+ lower_num = print_num[1];
+#else
+ upper_num = print_num[1];
+ lower_num = print_num[0];
+#endif
+
+ /* shake out un-needed bits by shift/or operations */
+ if (left_shift_bits >= 64) {
+ upper_num = lower_num << (left_shift_bits - 64);
+ lower_num = 0;
+ } else {
+ upper_num = (upper_num << left_shift_bits) |
+ (lower_num >> (64 - left_shift_bits));
+ lower_num = lower_num << left_shift_bits;
+ }
+
+ if (right_shift_bits >= 64) {
+ lower_num = upper_num >> (right_shift_bits - 64);
+ upper_num = 0;
+ } else {
+ lower_num = (lower_num >> right_shift_bits) |
+ (upper_num << (64 - right_shift_bits));
+ upper_num = upper_num >> right_shift_bits;
+ }
+
+#ifdef __BIG_ENDIAN_BITFIELD
+ print_num[0] = upper_num;
+ print_num[1] = lower_num;
+#else
+ print_num[0] = lower_num;
+ print_num[1] = upper_num;
+#endif
+}
+
static void btf_bitfield_seq_show(void *data, u8 bits_offset,
u8 nr_bits, struct seq_file *m)
{
u16 left_shift_bits, right_shift_bits;
u8 nr_copy_bytes;
u8 nr_copy_bits;
- u64 print_num;
+ u64 print_num[2] = {};
nr_copy_bits = nr_bits + bits_offset;
nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
- print_num = 0;
- memcpy(&print_num, data, nr_copy_bytes);
+ memcpy(print_num, data, nr_copy_bytes);
#ifdef __BIG_ENDIAN_BITFIELD
left_shift_bits = bits_offset;
#else
- left_shift_bits = BITS_PER_U64 - nr_copy_bits;
+ left_shift_bits = BITS_PER_U128 - nr_copy_bits;
#endif
- right_shift_bits = BITS_PER_U64 - nr_bits;
-
- print_num <<= left_shift_bits;
- print_num >>= right_shift_bits;
+ right_shift_bits = BITS_PER_U128 - nr_bits;
- seq_printf(m, "0x%llx", print_num);
+ btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
+ btf_int128_print(m, print_num);
}
@@ -1250,7 +1414,7 @@ static void btf_int_bits_seq_show(const struct btf *btf,
/*
* bits_offset is at most 7.
- * BTF_INT_OFFSET() cannot exceed 64 bits.
+ * BTF_INT_OFFSET() cannot exceed 128 bits.
*/
total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
@@ -1274,6 +1438,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
}
switch (nr_bits) {
+ case 128:
+ btf_int128_print(m, data);
+ break;
case 64:
if (sign)
seq_printf(m, "%lld", *(s64 *)data);
@@ -1438,7 +1605,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
u32 next_type_size = 0;
next_type = btf_type_by_id(btf, next_type_id);
- if (!next_type) {
+ if (!next_type || btf_type_is_resolve_source_only(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
@@ -1471,6 +1638,53 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
return 0;
}
+static int btf_var_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ u32 next_type_size;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || btf_type_is_resolve_source_only(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ if (btf_type_is_modifier(next_type)) {
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id;
+
+ resolved_type_id = next_type_id;
+ resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+ if (btf_type_is_ptr(resolved_type) &&
+ !env_type_is_resolve_sink(env, resolved_type) &&
+ !env_type_is_resolved(env, resolved_type_id))
+ return env_stack_push(env, resolved_type,
+ resolved_type_id);
+ }
+
+ /* We must resolve to something concrete at this point, no
+ * forward types or similar that would resolve to size of
+ * zero is allowed.
+ */
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+ return 0;
+}
+
static int btf_ptr_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
@@ -1480,7 +1694,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
struct btf *btf = env->btf;
next_type = btf_type_by_id(btf, next_type_id);
- if (!next_type) {
+ if (!next_type || btf_type_is_resolve_source_only(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
@@ -1538,6 +1752,15 @@ static void btf_modifier_seq_show(const struct btf *btf,
btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
}
+static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ t = btf_type_id_resolve(btf, &type_id);
+
+ btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+}
+
static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct seq_file *m)
@@ -1705,7 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->index_type */
index_type_id = array->index_type;
index_type = btf_type_by_id(btf, index_type_id);
- if (btf_type_nosize_or_null(index_type)) {
+ if (btf_type_nosize_or_null(index_type) ||
+ btf_type_is_resolve_source_only(index_type)) {
btf_verifier_log_type(env, v->t, "Invalid index");
return -EINVAL;
}
@@ -1724,7 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->type */
elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
- if (btf_type_nosize_or_null(elem_type)) {
+ if (btf_type_nosize_or_null(elem_type) ||
+ btf_type_is_resolve_source_only(elem_type)) {
btf_verifier_log_type(env, v->t,
"Invalid elem");
return -EINVAL;
@@ -1945,7 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
const struct btf_type *member_type = btf_type_by_id(env->btf,
member_type_id);
- if (btf_type_nosize_or_null(member_type)) {
+ if (btf_type_nosize_or_null(member_type) ||
+ btf_type_is_resolve_source_only(member_type)) {
btf_verifier_log_member(env, v->t, member,
"Invalid member");
return -EINVAL;
@@ -1980,6 +2206,43 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+ const struct btf_member *member;
+ u32 i, off = -ENOENT;
+
+ if (!__btf_type_is_struct(t))
+ return -EINVAL;
+
+ for_each_member(i, t, member) {
+ const struct btf_type *member_type = btf_type_by_id(btf,
+ member->type);
+ if (!__btf_type_is_struct(member_type))
+ continue;
+ if (member_type->size != sizeof(struct bpf_spin_lock))
+ continue;
+ if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+ "bpf_spin_lock"))
+ continue;
+ if (off != -ENOENT)
+ /* only one 'struct bpf_spin_lock' is allowed */
+ return -E2BIG;
+ off = btf_member_bit_offset(t, member);
+ if (off % 8)
+ /* valid C code cannot generate such BTF */
+ return -EINVAL;
+ off /= 8;
+ if (off % __alignof__(struct bpf_spin_lock))
+ /* valid struct bpf_spin_lock will be 4 byte aligned */
+ return -EINVAL;
+ }
+ return off;
+}
+
static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct seq_file *m)
@@ -2303,6 +2566,222 @@ static struct btf_kind_operations func_ops = {
.seq_show = btf_df_seq_show,
};
+static s32 btf_var_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_var *var;
+ u32 meta_needed = sizeof(*var);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (!t->name_off ||
+ !__btf_name_valid(env->btf, t->name_off, true)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ /* A var cannot be in type void */
+ if (!t->type || !BTF_TYPE_ID_VALID(t->type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ var = btf_type_var(t);
+ if (var->linkage != BTF_VAR_STATIC &&
+ var->linkage != BTF_VAR_GLOBAL_ALLOCATED) {
+ btf_verifier_log_type(env, t, "Linkage not supported");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t)
+{
+ const struct btf_var *var = btf_type_var(t);
+
+ btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage);
+}
+
+static const struct btf_kind_operations var_ops = {
+ .check_meta = btf_var_check_meta,
+ .resolve = btf_var_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_var_log,
+ .seq_show = btf_var_seq_show,
+};
+
+static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_var_secinfo *vsi;
+ u64 last_vsi_end_off = 0, sum = 0;
+ u32 i, meta_needed;
+
+ meta_needed = btf_type_vlen(t) * sizeof(*vsi);
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (!btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen == 0");
+ return -EINVAL;
+ }
+
+ if (!t->size) {
+ btf_verifier_log_type(env, t, "size == 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (!t->name_off ||
+ !btf_name_valid_section(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for_each_vsi(i, t, vsi) {
+ /* A var cannot be in type void */
+ if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid offset");
+ return -EINVAL;
+ }
+
+ if (!vsi->size || vsi->size > t->size) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid size");
+ return -EINVAL;
+ }
+
+ last_vsi_end_off = vsi->offset + vsi->size;
+ if (last_vsi_end_off > t->size) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid offset+size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_vsi(env, t, vsi, NULL);
+ sum += vsi->size;
+ }
+
+ if (t->size < sum) {
+ btf_verifier_log_type(env, t, "Invalid btf_info size");
+ return -EINVAL;
+ }
+
+ return meta_needed;
+}
+
+static int btf_datasec_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_var_secinfo *vsi;
+ struct btf *btf = env->btf;
+ u16 i;
+
+ for_each_vsi_from(i, v->next_member, v->t, vsi) {
+ u32 var_type_id = vsi->type, type_id, type_size = 0;
+ const struct btf_type *var_type = btf_type_by_id(env->btf,
+ var_type_id);
+ if (!var_type || !btf_type_is_var(var_type)) {
+ btf_verifier_log_vsi(env, v->t, vsi,
+ "Not a VAR kind member");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, var_type) &&
+ !env_type_is_resolved(env, var_type_id)) {
+ env_stack_set_next_member(env, i + 1);
+ return env_stack_push(env, var_type, var_type_id);
+ }
+
+ type_id = var_type->type;
+ if (!btf_type_id_size(btf, &type_id, &type_size)) {
+ btf_verifier_log_vsi(env, v->t, vsi, "Invalid type");
+ return -EINVAL;
+ }
+
+ if (vsi->size < type_size) {
+ btf_verifier_log_vsi(env, v->t, vsi, "Invalid size");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, 0, 0);
+ return 0;
+}
+
+static void btf_datasec_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_datasec_seq_show(const struct btf *btf,
+ const struct btf_type *t, u32 type_id,
+ void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const struct btf_var_secinfo *vsi;
+ const struct btf_type *var;
+ u32 i;
+
+ seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off));
+ for_each_vsi(i, t, vsi) {
+ var = btf_type_by_id(btf, vsi->type);
+ if (i)
+ seq_puts(m, ",");
+ btf_type_ops(var)->seq_show(btf, var, vsi->type,
+ data + vsi->offset, bits_offset, m);
+ }
+ seq_puts(m, "}");
+}
+
+static const struct btf_kind_operations datasec_ops = {
+ .check_meta = btf_datasec_check_meta,
+ .resolve = btf_datasec_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_datasec_log,
+ .seq_show = btf_datasec_seq_show,
+};
+
static int btf_func_proto_check(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -2434,6 +2913,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_RESTRICT] = &modifier_ops,
[BTF_KIND_FUNC] = &func_ops,
[BTF_KIND_FUNC_PROTO] = &func_proto_ops,
+ [BTF_KIND_VAR] = &var_ops,
+ [BTF_KIND_DATASEC] = &datasec_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -2514,13 +2995,17 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
if (!env_type_is_resolved(env, type_id))
return false;
- if (btf_type_is_struct(t))
+ if (btf_type_is_struct(t) || btf_type_is_datasec(t))
return !btf->resolved_ids[type_id] &&
- !btf->resolved_sizes[type_id];
+ !btf->resolved_sizes[type_id];
- if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) {
+ if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
+ btf_type_is_var(t)) {
t = btf_type_id_resolve(btf, &type_id);
- return t && !btf_type_is_modifier(t);
+ return t &&
+ !btf_type_is_modifier(t) &&
+ !btf_type_is_var(t) &&
+ !btf_type_is_datasec(t);
}
if (btf_type_is_array(t)) {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index d17d05570a3f..0a00eaca6fae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1,33 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Functions to manage eBPF programs attached to cgroups
*
* Copyright (c) 2016 Daniel Mack
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
+#include <linux/filter.h>
#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
+#include <net/bpf_sk_storage.h>
+
+#include "../cgroup/cgroup-internal.h"
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+void cgroup_bpf_offline(struct cgroup *cgrp)
+{
+ cgroup_get(cgrp);
+ percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
/**
- * cgroup_bpf_put() - put references of all bpf programs
- * @cgrp: the cgroup to modify
+ * cgroup_bpf_release() - put references of all bpf programs and
+ * release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
*/
-void cgroup_bpf_put(struct cgroup *cgrp)
+static void cgroup_bpf_release(struct work_struct *work)
{
+ struct cgroup *cgrp = container_of(work, struct cgroup,
+ bpf.release_work);
enum bpf_cgroup_storage_type stype;
+ struct bpf_prog_array *old_array;
unsigned int type;
+ mutex_lock(&cgroup_mutex);
+
for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog_list *pl, *tmp;
@@ -42,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp)
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
- bpf_prog_array_free(cgrp->bpf.effective[type]);
+ old_array = rcu_dereference_protected(
+ cgrp->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ bpf_prog_array_free(old_array);
}
+
+ mutex_unlock(&cgroup_mutex);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+ cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ * of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+ struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+ INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+ queue_work(system_wq, &cgrp->bpf.release_work);
}
/* count number of elements in the list.
@@ -98,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
*/
static int compute_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
- struct bpf_prog_array __rcu **array)
+ struct bpf_prog_array **array)
{
enum bpf_cgroup_storage_type stype;
struct bpf_prog_array *progs;
@@ -136,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
}
} while ((p = cgroup_parent(p)));
- rcu_assign_pointer(*array, progs);
+ *array = progs;
return 0;
}
static void activate_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
- struct bpf_prog_array __rcu *array)
+ struct bpf_prog_array *old_array)
{
- struct bpf_prog_array __rcu *old_array;
-
- old_array = xchg(&cgrp->bpf.effective[type], array);
+ rcu_swap_protected(cgrp->bpf.effective[type], old_array,
+ lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/
@@ -163,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
* that array below is variable length
*/
#define NR ARRAY_SIZE(cgrp->bpf.effective)
- struct bpf_prog_array __rcu *arrays[NR] = {};
- int i;
+ struct bpf_prog_array *arrays[NR] = {};
+ int ret, i;
+
+ ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+ GFP_KERNEL);
+ if (ret)
+ return ret;
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -180,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
cleanup:
for (i = 0; i < NR; i++)
bpf_prog_array_free(arrays[i]);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+
return -ENOMEM;
}
@@ -193,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
err = compute_effective_progs(desc, type, &desc->bpf.inactive);
if (err)
goto cleanup;
@@ -202,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
+ if (unlikely(desc->bpf.inactive)) {
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+ continue;
+ }
+
activate_effective_progs(desc, type, desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -230,6 +284,7 @@ cleanup:
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to attach
* @type: Type of attach operation
+ * @flags: Option flags
*
* Must be called with cgroup_mutex held.
*/
@@ -363,7 +418,7 @@ cleanup:
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 unused_flags)
+ enum bpf_attach_type type)
{
struct list_head *progs = &cgrp->bpf.progs[type];
enum bpf_cgroup_storage_type stype;
@@ -440,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
enum bpf_attach_type type = attr->query.attach_type;
struct list_head *progs = &cgrp->bpf.progs[type];
u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog_array *effective;
int cnt, ret = 0, i;
+ effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
- cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+ cnt = bpf_prog_array_length(effective);
else
cnt = prog_list_length(progs);
@@ -460,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
- return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
- prog_ids, cnt);
+ return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
} else {
struct bpf_prog_list *pl;
u32 id;
@@ -544,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* The program type passed in via @type must be suitable for network
* filtering. No further check is performed to assert that.
*
- * This function will return %-EPERM if any if an attached program was found
- * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ * For egress packets, this function can return:
+ * NET_XMIT_SUCCESS (0) - continue with packet output
+ * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
+ * NET_XMIT_CN (2) - continue with packet output and notify TCP
+ * to call cwr
+ * -EPERM - drop packet
+ *
+ * For ingress packets, this function will return -EPERM if any
+ * attached program was found and if it returned != 1 during execution.
+ * Otherwise 0 is returned.
*/
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
@@ -571,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
/* compute pointers for the bpf prog */
bpf_compute_and_save_data_end(skb, &saved_data_end);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- __bpf_prog_run_save_cb);
+ if (type == BPF_CGROUP_INET_EGRESS) {
+ ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
+ cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+ } else {
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ __bpf_prog_run_save_cb);
+ ret = (ret == 1 ? 0 : -EPERM);
+ }
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
skb->sk = save_sk;
- return ret == 1 ? 0 : -EPERM;
+
+ return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
@@ -700,7 +773,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -709,6 +782,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_update_elem_proto;
case BPF_FUNC_map_delete_elem:
return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_map_push_elem:
+ return &bpf_map_push_elem_proto;
+ case BPF_FUNC_map_pop_elem:
+ return &bpf_map_pop_elem_proto;
+ case BPF_FUNC_map_peek_elem:
+ return &bpf_map_peek_elem_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
@@ -724,6 +803,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return cgroup_base_func_proto(func_id, prog);
+}
+
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -761,3 +846,692 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
.get_func_proto = cgroup_dev_func_proto,
.is_valid_access = cgroup_dev_is_valid_access,
};
+
+/**
+ * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
+ *
+ * @head: sysctl table header
+ * @table: sysctl table
+ * @write: sysctl is being read (= 0) or written (= 1)
+ * @buf: pointer to buffer passed by user space
+ * @pcount: value-result argument: value is size of buffer pointed to by @buf,
+ * result is size of @new_buf if program set new value, initial value
+ * otherwise
+ * @ppos: value-result argument: value is position at which read from or write
+ * to sysctl is happening, result is new position if program overrode it,
+ * initial value otherwise
+ * @new_buf: pointer to pointer to new buffer that will be allocated if program
+ * overrides new value provided by user space on sysctl write
+ * NOTE: it's caller responsibility to free *new_buf if it was set
+ * @type: type of program to be executed
+ *
+ * Program is run when sysctl is being accessed, either read or written, and
+ * can allow or deny such access.
+ *
+ * This function will return %-EPERM if an attached program is found and
+ * returned value != 1 during execution. In all other cases 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
+ struct ctl_table *table, int write,
+ void __user *buf, size_t *pcount,
+ loff_t *ppos, void **new_buf,
+ enum bpf_attach_type type)
+{
+ struct bpf_sysctl_kern ctx = {
+ .head = head,
+ .table = table,
+ .write = write,
+ .ppos = ppos,
+ .cur_val = NULL,
+ .cur_len = PAGE_SIZE,
+ .new_val = NULL,
+ .new_len = 0,
+ .new_updated = 0,
+ };
+ struct cgroup *cgrp;
+ int ret;
+
+ ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
+ if (ctx.cur_val) {
+ mm_segment_t old_fs;
+ loff_t pos = 0;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
+ &ctx.cur_len, &pos)) {
+ /* Let BPF program decide how to proceed. */
+ ctx.cur_len = 0;
+ }
+ set_fs(old_fs);
+ } else {
+ /* Let BPF program decide how to proceed. */
+ ctx.cur_len = 0;
+ }
+
+ if (write && buf && *pcount) {
+ /* BPF program should be able to override new value with a
+ * buffer bigger than provided by user.
+ */
+ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
+ ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
+ if (!ctx.new_val ||
+ copy_from_user(ctx.new_val, buf, ctx.new_len))
+ /* Let BPF program decide how to proceed. */
+ ctx.new_len = 0;
+ }
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+ rcu_read_unlock();
+
+ kfree(ctx.cur_val);
+
+ if (ret == 1 && ctx.new_updated) {
+ *new_buf = ctx.new_val;
+ *pcount = ctx.new_len;
+ } else {
+ kfree(ctx.new_val);
+ }
+
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
+
+#ifdef CONFIG_NET
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_prog_array *prog_array;
+ bool empty;
+
+ rcu_read_lock();
+ prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+ empty = bpf_prog_array_is_empty(prog_array);
+ rcu_read_unlock();
+
+ return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+ if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+ return -EINVAL;
+
+ ctx->optval = kzalloc(max_optlen, GFP_USER);
+ if (!ctx->optval)
+ return -ENOMEM;
+
+ ctx->optval_end = ctx->optval + max_optlen;
+ ctx->optlen = max_optlen;
+
+ return 0;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+ kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+ int *optname, char __user *optval,
+ int *optlen, char **kernel_optval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = *level,
+ .optname = *optname,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+ return 0;
+
+ ret = sockopt_alloc_buf(&ctx, *optlen);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen == -1) {
+ /* optlen set to -1, bypass kernel */
+ ret = 1;
+ } else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+ /* optlen is out of bounds */
+ ret = -EFAULT;
+ } else {
+ /* optlen within bounds, run kernel handler */
+ ret = 0;
+
+ /* export any potential modifications */
+ *level = ctx.level;
+ *optname = ctx.optname;
+ *optlen = ctx.optlen;
+ *kernel_optval = ctx.optval;
+ }
+
+out:
+ if (ret)
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval,
+ int __user *optlen, int max_optlen,
+ int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .retval = retval,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+ return retval;
+
+ ret = sockopt_alloc_buf(&ctx, max_optlen);
+ if (ret)
+ return ret;
+
+ if (!retval) {
+ /* If kernel getsockopt finished successfully,
+ * copy whatever was returned to the user back
+ * into our temporary buffer. Set optlen to the
+ * one that kernel returned as well to let
+ * BPF programs inspect the value.
+ */
+
+ if (get_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen)
+ ctx.optlen = max_optlen;
+
+ if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* BPF programs only allowed to set retval to 0, not some
+ * arbitrary value.
+ */
+ if (ctx.retval != 0 && ctx.retval != retval) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+ put_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = ctx.retval;
+
+out:
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
+#endif
+
+static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
+ size_t *lenp)
+{
+ ssize_t tmp_ret = 0, ret;
+
+ if (dir->header.parent) {
+ tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
+ if (tmp_ret < 0)
+ return tmp_ret;
+ }
+
+ ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
+ if (ret < 0)
+ return ret;
+ *bufp += ret;
+ *lenp -= ret;
+ ret += tmp_ret;
+
+ /* Avoid leading slash. */
+ if (!ret)
+ return ret;
+
+ tmp_ret = strscpy(*bufp, "/", *lenp);
+ if (tmp_ret < 0)
+ return tmp_ret;
+ *bufp += tmp_ret;
+ *lenp -= tmp_ret;
+
+ return ret + tmp_ret;
+}
+
+BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
+ size_t, buf_len, u64, flags)
+{
+ ssize_t tmp_ret = 0, ret;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
+ if (!ctx->head)
+ return -EINVAL;
+ tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
+ if (tmp_ret < 0)
+ return tmp_ret;
+ }
+
+ ret = strscpy(buf, ctx->table->procname, buf_len);
+
+ return ret < 0 ? ret : tmp_ret + ret;
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
+ .func = bpf_sysctl_get_name,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
+ size_t src_len)
+{
+ if (!dst)
+ return -EINVAL;
+
+ if (!dst_len)
+ return -E2BIG;
+
+ if (!src || !src_len) {
+ memset(dst, 0, dst_len);
+ return -EINVAL;
+ }
+
+ memcpy(dst, src, min(dst_len, src_len));
+
+ if (dst_len > src_len) {
+ memset(dst + src_len, '\0', dst_len - src_len);
+ return src_len;
+ }
+
+ dst[dst_len - 1] = '\0';
+
+ return -E2BIG;
+}
+
+BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
+ char *, buf, size_t, buf_len)
+{
+ return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
+ .func = bpf_sysctl_get_current_value,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
+ size_t, buf_len)
+{
+ if (!ctx->write) {
+ if (buf && buf_len)
+ memset(buf, '\0', buf_len);
+ return -EINVAL;
+ }
+ return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
+ .func = bpf_sysctl_get_new_value,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
+ const char *, buf, size_t, buf_len)
+{
+ if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
+ return -EINVAL;
+
+ if (buf_len > PAGE_SIZE - 1)
+ return -E2BIG;
+
+ memcpy(ctx->new_val, buf, buf_len);
+ ctx->new_len = buf_len;
+ ctx->new_updated = 1;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
+ .func = bpf_sysctl_set_new_value,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto *
+sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
+ case BPF_FUNC_sysctl_get_name:
+ return &bpf_sysctl_get_name_proto;
+ case BPF_FUNC_sysctl_get_current_value:
+ return &bpf_sysctl_get_current_value_proto;
+ case BPF_FUNC_sysctl_get_new_value:
+ return &bpf_sysctl_get_new_value_proto;
+ case BPF_FUNC_sysctl_set_new_value:
+ return &bpf_sysctl_set_new_value_proto;
+ default:
+ return cgroup_base_func_proto(func_id, prog);
+ }
+}
+
+static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
+ return false;
+
+ switch (off) {
+ case offsetof(struct bpf_sysctl, write):
+ if (type != BPF_READ)
+ return false;
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+ case offsetof(struct bpf_sysctl, file_pos):
+ if (type == BPF_READ) {
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+ } else {
+ return size == size_default;
+ }
+ default:
+ return false;
+ }
+}
+
+static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sysctl, write):
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sysctl_kern, write,
+ FIELD_SIZEOF(struct bpf_sysctl_kern,
+ write),
+ target_size));
+ break;
+ case offsetof(struct bpf_sysctl, file_pos):
+ /* ppos is a pointer so it should be accessed via indirect
+ * loads and stores. Also for stores additional temporary
+ * register is used since neither src_reg nor dst_reg can be
+ * overridden.
+ */
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(
+ BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sysctl_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sysctl_kern, ppos));
+ *insn++ = BPF_STX_MEM(
+ BPF_SIZEOF(u32), treg, si->src_reg, 0);
+ *insn++ = BPF_LDX_MEM(
+ BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sysctl_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sysctl_kern, ppos));
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
+ }
+ *target_size = sizeof(u32);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
+ .get_func_proto = sysctl_func_proto,
+ .is_valid_access = sysctl_is_valid_access,
+ .convert_ctx_access = sysctl_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sysctl_prog_ops = {
+};
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+#ifdef CONFIG_NET
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif
+ default:
+ return cgroup_base_func_proto(func_id, prog);
+ }
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sockopt))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_GETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optname):
+ /* fallthrough */
+ case offsetof(struct bpf_sockopt, level):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_SETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optlen):
+ return size == size_default;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct bpf_sockopt, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+ default:
+ if (size != size_default)
+ return false;
+ break;
+ }
+ return true;
+}
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
+ T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sockopt, sk):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ break;
+ case offsetof(struct bpf_sockopt, level):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ break;
+ case offsetof(struct bpf_sockopt, optname):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ break;
+ case offsetof(struct bpf_sockopt, optlen):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+ bool direct_write,
+ const struct bpf_prog *prog)
+{
+ /* Nothing to do for sockopt argument. The data is kzalloc'ated.
+ */
+ return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+ .get_func_proto = cg_sockopt_func_proto,
+ .is_valid_access = cg_sockopt_is_valid_access,
+ .convert_ctx_access = cg_sockopt_convert_ctx_access,
+ .gen_prologue = cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f908b9356025..16079550db6d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux Socket Filter - Kernel level socket filtering
*
@@ -12,11 +13,6 @@
* Alexei Starovoitov <ast@plumgrid.com>
* Daniel Borkmann <dborkman@redhat.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
@@ -78,7 +74,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
return NULL;
}
-struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog_aux *aux;
@@ -104,6 +100,32 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
return fp;
}
+
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ struct bpf_prog *prog;
+ int cpu;
+
+ prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags);
+ if (!prog)
+ return NULL;
+
+ prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+ if (!prog->aux->stats) {
+ kfree(prog->aux);
+ vfree(prog);
+ return NULL;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct bpf_prog_stats *pstats;
+
+ pstats = per_cpu_ptr(prog->aux->stats, cpu);
+ u64_stats_init(&pstats->syncp);
+ }
+ return prog;
+}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
@@ -231,7 +253,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
void __bpf_prog_free(struct bpf_prog *fp)
{
- kfree(fp->aux);
+ if (fp->aux) {
+ free_percpu(fp->aux->stats);
+ kfree(fp->aux);
+ }
vfree(fp);
}
@@ -263,7 +288,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
dst[i] = fp->insnsi[i];
if (!was_ld_map &&
dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
- dst[i].src_reg == BPF_PSEUDO_MAP_FD) {
+ (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
+ dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
was_ld_map = true;
dst[i].imm = 0;
} else if (was_ld_map &&
@@ -307,15 +333,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
return 0;
}
-static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
- u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, s32 curr, const bool probe_pass)
{
const s64 imm_min = S32_MIN, imm_max = S32_MAX;
+ s32 delta = end_new - end_old;
s64 imm = insn->imm;
- if (curr < pos && curr + imm + 1 > pos)
+ if (curr < pos && curr + imm + 1 >= end_old)
imm += delta;
- else if (curr > pos + delta && curr + imm + 1 <= pos + delta)
+ else if (curr >= end_new && curr + imm + 1 < end_new)
imm -= delta;
if (imm < imm_min || imm > imm_max)
return -ERANGE;
@@ -324,15 +351,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta,
return 0;
}
-static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
- u32 curr, const bool probe_pass)
+static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
+ s32 end_new, s32 curr, const bool probe_pass)
{
const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s32 delta = end_new - end_old;
s32 off = insn->off;
- if (curr < pos && curr + off + 1 > pos)
+ if (curr < pos && curr + off + 1 >= end_old)
off += delta;
- else if (curr > pos + delta && curr + off + 1 <= pos + delta)
+ else if (curr >= end_new && curr + off + 1 < end_new)
off -= delta;
if (off < off_min || off > off_max)
return -ERANGE;
@@ -341,10 +369,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta,
return 0;
}
-static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
- const bool probe_pass)
+static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
+ s32 end_new, const bool probe_pass)
{
- u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0);
+ u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0);
struct bpf_insn *insn = prog->insnsi;
int ret = 0;
@@ -356,22 +384,23 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta,
* do any other adjustments. Therefore skip the patchlet.
*/
if (probe_pass && i == pos) {
- i += delta + 1;
- insn++;
+ i = end_new;
+ insn = prog->insnsi + end_old;
}
code = insn->code;
- if (BPF_CLASS(code) != BPF_JMP ||
+ if ((BPF_CLASS(code) != BPF_JMP &&
+ BPF_CLASS(code) != BPF_JMP32) ||
BPF_OP(code) == BPF_EXIT)
continue;
/* Adjust offset of jmps if we cross patch boundaries. */
if (BPF_OP(code) == BPF_CALL) {
if (insn->src_reg != BPF_PSEUDO_CALL)
continue;
- ret = bpf_adj_delta_to_imm(insn, pos, delta, i,
- probe_pass);
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
} else {
- ret = bpf_adj_delta_to_off(insn, pos, delta, i,
- probe_pass);
+ ret = bpf_adj_delta_to_off(insn, pos, end_old,
+ end_new, i, probe_pass);
}
if (ret)
break;
@@ -406,6 +435,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
const u32 cnt_max = S16_MAX;
struct bpf_prog *prog_adj;
+ int err;
/* Since our patchlet doesn't expand the image, we're done. */
if (insn_delta == 0) {
@@ -421,8 +451,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
* we afterwards may not fail anymore.
*/
if (insn_adj_cnt > cnt_max &&
- bpf_adj_branches(prog, off, insn_delta, true))
- return NULL;
+ (err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
+ return ERR_PTR(err);
/* Several new instructions need to be inserted. Make room
* for them. Likely, there's no need for a new allocation as
@@ -431,7 +461,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
GFP_USER);
if (!prog_adj)
- return NULL;
+ return ERR_PTR(-ENOMEM);
prog_adj->len = insn_adj_cnt;
@@ -453,13 +483,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
* the ship has sailed to reverse to the original state. An
* overflow cannot happen at this point.
*/
- BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false));
+ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false));
bpf_adj_linfo(prog_adj, off, insn_delta);
return prog_adj;
}
+int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt)
+{
+ /* Branch offsets can't overflow when program is shrinking, no need
+ * to call bpf_adj_branches(..., true) here
+ */
+ memmove(prog->insnsi + off, prog->insnsi + off + cnt,
+ sizeof(struct bpf_insn) * (prog->len - off - cnt));
+ prog->len -= cnt;
+
+ return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false));
+}
+
void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp)
{
int i;
@@ -495,7 +537,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
*symbol_end = addr + hdr->pages * PAGE_SIZE;
}
-static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
const char *end = sym + KSYM_NAME_LEN;
const struct btf_type *type;
@@ -804,7 +846,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
- bpf_jit_binary_unlock_ro(hdr);
bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
@@ -934,6 +975,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
break;
+ case BPF_JMP32 | BPF_JEQ | BPF_K:
+ case BPF_JMP32 | BPF_JNE | BPF_K:
+ case BPF_JMP32 | BPF_JGT | BPF_K:
+ case BPF_JMP32 | BPF_JLT | BPF_K:
+ case BPF_JMP32 | BPF_JGE | BPF_K:
+ case BPF_JMP32 | BPF_JLE | BPF_K:
+ case BPF_JMP32 | BPF_JSGT | BPF_K:
+ case BPF_JMP32 | BPF_JSLT | BPF_K:
+ case BPF_JMP32 | BPF_JSGE | BPF_K:
+ case BPF_JMP32 | BPF_JSLE | BPF_K:
+ case BPF_JMP32 | BPF_JSET | BPF_K:
+ /* Accommodate for extra offset in case of a backjump. */
+ off = from->off;
+ if (off < 0)
+ off -= 2;
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
+ off);
+ break;
+
case BPF_LD | BPF_IMM | BPF_DW:
*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
@@ -1031,13 +1093,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
continue;
tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
- if (!tmp) {
+ if (IS_ERR(tmp)) {
/* Patching may have repointed aux->prog during
* realloc from the original one, so we need to
* fix it up here on error.
*/
bpf_jit_prog_release_other(prog, clone);
- return ERR_PTR(-ENOMEM);
+ return tmp;
}
clone = tmp;
@@ -1130,6 +1192,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_2(JMP, CALL), \
/* Exit instruction. */ \
INSN_2(JMP, EXIT), \
+ /* 32-bit Jump instructions. */ \
+ /* Register based. */ \
+ INSN_3(JMP32, JEQ, X), \
+ INSN_3(JMP32, JNE, X), \
+ INSN_3(JMP32, JGT, X), \
+ INSN_3(JMP32, JLT, X), \
+ INSN_3(JMP32, JGE, X), \
+ INSN_3(JMP32, JLE, X), \
+ INSN_3(JMP32, JSGT, X), \
+ INSN_3(JMP32, JSLT, X), \
+ INSN_3(JMP32, JSGE, X), \
+ INSN_3(JMP32, JSLE, X), \
+ INSN_3(JMP32, JSET, X), \
+ /* Immediate based. */ \
+ INSN_3(JMP32, JEQ, K), \
+ INSN_3(JMP32, JNE, K), \
+ INSN_3(JMP32, JGT, K), \
+ INSN_3(JMP32, JLT, K), \
+ INSN_3(JMP32, JGE, K), \
+ INSN_3(JMP32, JLE, K), \
+ INSN_3(JMP32, JSGT, K), \
+ INSN_3(JMP32, JSLT, K), \
+ INSN_3(JMP32, JSGE, K), \
+ INSN_3(JMP32, JSLE, K), \
+ INSN_3(JMP32, JSET, K), \
/* Jump instructions. */ \
/* Register based. */ \
INSN_3(JMP, JEQ, X), \
@@ -1202,8 +1289,9 @@ bool bpf_opcode_in_insntable(u8 code)
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
- * @ctx: is the data we are operating on
+ * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
+ * @stack: is the eBPF storage stack
*
* Decode and execute eBPF instructions.
*/
@@ -1276,10 +1364,10 @@ select_insn:
insn++;
CONT;
ALU_ARSH_X:
- DST = (u64) (u32) ((*(s32 *) &DST) >> SRC);
+ DST = (u64) (u32) (((s32) DST) >> SRC);
CONT;
ALU_ARSH_K:
- DST = (u64) (u32) ((*(s32 *) &DST) >> IMM);
+ DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
@@ -1390,145 +1478,49 @@ select_insn:
out:
CONT;
}
- /* JMP */
JMP_JA:
insn += insn->off;
CONT;
- JMP_JEQ_X:
- if (DST == SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JEQ_K:
- if (DST == IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_X:
- if (DST != SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_K:
- if (DST != IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_X:
- if (DST > SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_K:
- if (DST > IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLT_X:
- if (DST < SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLT_K:
- if (DST < IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_X:
- if (DST >= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_K:
- if (DST >= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLE_X:
- if (DST <= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JLE_K:
- if (DST <= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_X:
- if (((s64) DST) > ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_K:
- if (((s64) DST) > ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLT_X:
- if (((s64) DST) < ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLT_K:
- if (((s64) DST) < ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_X:
- if (((s64) DST) >= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_K:
- if (((s64) DST) >= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLE_X:
- if (((s64) DST) <= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSLE_K:
- if (((s64) DST) <= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_X:
- if (DST & SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_K:
- if (DST & IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
JMP_EXIT:
return BPF_R0;
-
+ /* JMP */
+#define COND_JMP(SIGN, OPCODE, CMP_OP) \
+ JMP_##OPCODE##_X: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_X: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP_##OPCODE##_K: \
+ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT; \
+ JMP32_##OPCODE##_K: \
+ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \
+ insn += insn->off; \
+ CONT_JMP; \
+ } \
+ CONT;
+ COND_JMP(u, JEQ, ==)
+ COND_JMP(u, JNE, !=)
+ COND_JMP(u, JGT, >)
+ COND_JMP(u, JLT, <)
+ COND_JMP(u, JGE, >=)
+ COND_JMP(u, JLE, <=)
+ COND_JMP(u, JSET, &)
+ COND_JMP(s, JSGT, >)
+ COND_JMP(s, JSLT, <)
+ COND_JMP(s, JSGE, >=)
+ COND_JMP(s, JSLE, <=)
+#undef COND_JMP
/* STX and ST and LDX*/
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
@@ -1799,38 +1791,42 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
return &empty_prog_array.hdr;
}
-void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs ||
- progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ if (!progs || progs == &empty_prog_array.hdr)
return;
kfree_rcu(progs, rcu);
}
-int bpf_prog_array_length(struct bpf_prog_array __rcu *array)
+int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
u32 cnt = 0;
- rcu_read_lock();
- item = rcu_dereference(array)->items;
- for (; item->prog; item++)
+ for (item = array->items; item->prog; item++)
if (item->prog != &dummy_bpf_prog.prog)
cnt++;
- rcu_read_unlock();
return cnt;
}
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
+{
+ struct bpf_prog_array_item *item;
-static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
+ for (item = array->items; item->prog; item++)
+ if (item->prog != &dummy_bpf_prog.prog)
+ return false;
+ return true;
+}
+
+static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
u32 *prog_ids,
u32 request_cnt)
{
struct bpf_prog_array_item *item;
int i = 0;
- item = rcu_dereference_check(array, 1)->items;
- for (; item->prog; item++) {
+ for (item = array->items; item->prog; item++) {
if (item->prog == &dummy_bpf_prog.prog)
continue;
prog_ids[i] = item->prog->aux->id;
@@ -1843,7 +1839,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
return !!(item->prog);
}
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
__u32 __user *prog_ids, u32 cnt)
{
unsigned long err = 0;
@@ -1854,18 +1850,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
* cnt = bpf_prog_array_length();
* if (cnt > 0)
* bpf_prog_array_copy_to_user(..., cnt);
- * so below kcalloc doesn't need extra cnt > 0 check, but
- * bpf_prog_array_length() releases rcu lock and
- * prog array could have been swapped with empty or larger array,
- * so always copy 'cnt' prog_ids to the user.
- * In a rare race the user will see zero prog_ids
+ * so below kcalloc doesn't need extra cnt > 0 check.
*/
ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
if (!ids)
return -ENOMEM;
- rcu_read_lock();
nospc = bpf_prog_array_copy_core(array, ids, cnt);
- rcu_read_unlock();
err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
kfree(ids);
if (err)
@@ -1875,19 +1865,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
return 0;
}
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array,
+void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
struct bpf_prog *old_prog)
{
- struct bpf_prog_array_item *item = array->items;
+ struct bpf_prog_array_item *item;
- for (; item->prog; item++)
+ for (item = array->items; item->prog; item++)
if (item->prog == old_prog) {
WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
break;
}
}
-int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
struct bpf_prog_array **new_array)
@@ -1951,7 +1941,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
return 0;
}
-int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_info(struct bpf_prog_array *array,
u32 *prog_ids, u32 request_cnt,
u32 *prog_cnt)
{
@@ -2036,6 +2026,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_map_push_elem_proto __weak;
const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
@@ -2092,6 +2084,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
return false;
}
+/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
+ * analysis code and wants explicit zero extension inserted by verifier.
+ * Otherwise, return FALSE.
+ */
+bool __weak bpf_jit_needs_zext(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
@@ -2101,8 +2102,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
return -EFAULT;
}
+DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
+EXPORT_SYMBOL(bpf_stats_enabled_key);
+
/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8974b3755670..ef49e17ae47c 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* bpf/cpumap.c
*
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
- * Released under terms in GPL version 2. See COPYING.
*/
/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
@@ -32,14 +32,19 @@
/* General idea: XDP packets getting XDP redirected to another CPU,
* will maximum be stored/queued for one driver ->poll() call. It is
- * guaranteed that setting flush bit and flush operation happen on
+ * guaranteed that queueing the frame and the flush operation happen on
* same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
* which queue in bpf_cpu_map_entry contains packets.
*/
#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct bpf_cpu_map_entry;
+struct bpf_cpu_map;
+
struct xdp_bulk_queue {
void *q[CPU_MAP_BULK_SIZE];
+ struct list_head flush_node;
+ struct bpf_cpu_map_entry *obj;
unsigned int count;
};
@@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
+ struct bpf_cpu_map *cmap;
+
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
@@ -65,23 +72,17 @@ struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
struct bpf_cpu_map_entry **cpu_map;
- unsigned long __percpu *flush_needed;
+ struct list_head __percpu *flush_list;
};
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
- struct xdp_bulk_queue *bq, bool in_napi_ctx);
-
-static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
-{
- return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
-}
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
+ int ret, cpu;
u64 cost;
- int ret;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -105,23 +106,21 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
- cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_cmap;
- cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ cost += sizeof(struct list_head) * num_possible_cpus();
/* Notice returns -EPERM on if map size is larger than memlock limit */
- ret = bpf_map_precharge_memlock(cmap->map.pages);
+ ret = bpf_map_charge_init(&cmap->map.memory, cost);
if (ret) {
err = ret;
goto free_cmap;
}
- /* A per cpu bitfield with a bit per possible CPU in map */
- cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
- __alignof__(unsigned long));
- if (!cmap->flush_needed)
- goto free_cmap;
+ cmap->flush_list = alloc_percpu(struct list_head);
+ if (!cmap->flush_list)
+ goto free_charge;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
@@ -132,7 +131,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return &cmap->map;
free_percpu:
- free_percpu(cmap->flush_needed);
+ free_percpu(cmap->flush_list);
+free_charge:
+ bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
kfree(cmap);
return ERR_PTR(err);
@@ -160,11 +161,15 @@ static void cpu_map_kthread_stop(struct work_struct *work)
}
static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
- struct xdp_frame *xdpf)
+ struct xdp_frame *xdpf,
+ struct sk_buff *skb)
{
+ unsigned int hard_start_headroom;
unsigned int frame_size;
void *pkt_data_start;
- struct sk_buff *skb;
+
+ /* Part of headroom was reserved to xdpf */
+ hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
/* build_skb need to place skb_shared_info after SKB end, and
* also want to know the memory "truesize". Thus, need to
@@ -183,15 +188,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* is not at a fixed memory location, with mixed length
* packets, which is bad for cache-line hotness.
*/
- frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) +
+ frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- pkt_data_start = xdpf->data - xdpf->headroom;
- skb = build_skb(pkt_data_start, frame_size);
- if (!skb)
+ pkt_data_start = xdpf->data - hard_start_headroom;
+ skb = build_skb_around(skb, pkt_data_start, frame_size);
+ if (unlikely(!skb))
return NULL;
- skb_reserve(skb, xdpf->headroom);
+ skb_reserve(skb, hard_start_headroom);
__skb_put(skb, xdpf->len);
if (xdpf->metasize)
skb_metadata_set(skb, xdpf->metasize);
@@ -205,6 +210,12 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* - RX ring dev queue index (skb_record_rx_queue)
*/
+ /* Until page_pool get SKB return path, release DMA here */
+ xdp_release_frame(xdpf);
+
+ /* Allow SKB to reuse area used by xdp_frame */
+ xdp_scrub_frame(xdpf);
+
return skb;
}
@@ -233,6 +244,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
}
}
+#define CPUMAP_BATCH 8
+
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
@@ -245,8 +258,11 @@ static int cpu_map_kthread_run(void *data)
* kthread_stop signal until queue is empty.
*/
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
- unsigned int processed = 0, drops = 0, sched = 0;
- struct xdp_frame *xdpf;
+ unsigned int drops = 0, sched = 0;
+ void *frames[CPUMAP_BATCH];
+ void *skbs[CPUMAP_BATCH];
+ gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+ int i, n, m;
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -262,18 +278,38 @@ static int cpu_map_kthread_run(void *data)
sched = cond_resched();
}
- /* Process packets in rcpu->queue */
- local_bh_disable();
/*
* The bpf_cpu_map_entry is single consumer, with this
* kthread CPU pinned. Lockless access to ptr_ring
* consume side valid as no-resize allowed of queue.
*/
- while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
- struct sk_buff *skb;
+ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
+
+ for (i = 0; i < n; i++) {
+ void *f = frames[i];
+ struct page *page = virt_to_page(f);
+
+ /* Bring struct page memory area to curr CPU. Read by
+ * build_skb_around via page_is_pfmemalloc(), and when
+ * freed written by page_frag_free call.
+ */
+ prefetchw(page);
+ }
+
+ m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
+ if (unlikely(m == 0)) {
+ for (i = 0; i < n; i++)
+ skbs[i] = NULL; /* effect: xdp_return_frame */
+ drops = n;
+ }
+
+ local_bh_disable();
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ struct sk_buff *skb = skbs[i];
int ret;
- skb = cpu_map_build_skb(rcpu, xdpf);
+ skb = cpu_map_build_skb(rcpu, xdpf, skb);
if (!skb) {
xdp_return_frame(xdpf);
continue;
@@ -283,13 +319,9 @@ static int cpu_map_kthread_run(void *data)
ret = netif_receive_skb_core(skb);
if (ret == NET_RX_DROP)
drops++;
-
- /* Limit BH-disable period */
- if (++processed == 8)
- break;
}
/* Feedback loop via tracepoint */
- trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+ trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
local_bh_enable(); /* resched point, may call do_softirq() */
}
@@ -304,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
- int numa, err;
+ struct xdp_bulk_queue *bq;
+ int numa, err, i;
/* Have map->numa_node, but choose node of redirect target CPU */
numa = cpu_to_node(cpu);
@@ -319,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
if (!rcpu->bulkq)
goto free_rcu;
+ for_each_possible_cpu(i) {
+ bq = per_cpu_ptr(rcpu->bulkq, i);
+ bq->obj = rcpu;
+ }
+
/* Alloc queue */
rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
if (!rcpu->queue)
@@ -375,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
/* No concurrent bq_enqueue can run at this point */
- bq_flush_to_queue(rcpu, bq, false);
+ bq_flush_to_queue(bq, false);
}
free_percpu(rcpu->bulkq);
/* Cannot kthread_stop() here, last put free rcpu resources */
@@ -458,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
if (!rcpu)
return -ENOMEM;
+ rcpu->cmap = cmap;
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -484,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map)
synchronize_rcu();
/* To ensure all pending flush operations have completed wait for flush
- * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
- * Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new bits will be set.
+ * list be empty on _all_ cpus. Because the above synchronize_rcu()
+ * ensures the map is disconnected from the program we can assume no new
+ * items will be added to the list.
*/
for_each_online_cpu(cpu) {
- unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+ struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
- while (!bitmap_empty(bitmap, cmap->map.max_entries))
+ while (!list_empty(flush_list))
cond_resched();
}
@@ -508,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map)
/* bq flush and cleanup happens after RCU graze-period */
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
- free_percpu(cmap->flush_needed);
+ free_percpu(cmap->flush_list);
bpf_map_area_free(cmap->cpu_map);
kfree(cmap);
}
@@ -560,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
};
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
- struct xdp_bulk_queue *bq, bool in_napi_ctx)
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
{
+ struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
const int to_cpu = rcpu->cpu;
struct ptr_ring *q;
@@ -591,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
bq->count = 0;
spin_unlock(&q->producer_lock);
+ __list_del_clearprev(&bq->flush_node);
+
/* Feedback loop via tracepoints */
trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
return 0;
@@ -601,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
*/
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
+ struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
- bq_flush_to_queue(rcpu, bq, true);
+ bq_flush_to_queue(bq, true);
/* Notice, xdp_buff/page MUST be queued here, long enough for
* driver to code invoking us to finished, due to driver
@@ -616,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
* operation, when completing napi->poll call.
*/
bq->q[bq->count++] = xdpf;
+
+ if (!bq->flush_node.prev)
+ list_add(&bq->flush_node, flush_list);
+
return 0;
}
@@ -635,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
-void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
- struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
-
- __set_bit(bit, bitmap);
-}
-
void __cpu_map_flush(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
- u32 bit;
-
- /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
- * and __cpu_map_flush() happen on same CPU. Thus, the percpu
- * bitmap indicate which percpu bulkq have packets.
- */
- for_each_set_bit(bit, bitmap, map->max_entries) {
- struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
- struct xdp_bulk_queue *bq;
-
- /* This is possible if entry is removed by user space
- * between xdp redirect and flush op.
- */
- if (unlikely(!rcpu))
- continue;
-
- __clear_bit(bit, bitmap);
+ struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
+ struct xdp_bulk_queue *bq, *tmp;
- /* Flush all frames in bulkq to real queue */
- bq = this_cpu_ptr(rcpu->bulkq);
- bq_flush_to_queue(rcpu, bq, true);
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ bq_flush_to_queue(bq, true);
/* If already running, costs spin_lock_irqsave + smb_mb */
- wake_up_process(rcpu->kthread);
+ wake_up_process(bq->obj->kthread);
}
}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..d83cf8ccc872 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
/* Devmaps primary use is as a backend map for XDP BPF helper call
@@ -25,9 +17,8 @@
* datapath always has a valid copy. However, the datapath does a "flush"
* operation that pushes any pending packets in the driver outside the RCU
* critical section. Each bpf_dtab_netdev tracks these pending operations using
- * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
- * until all bits are cleared indicating outstanding flush operations have
- * completed.
+ * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
+ * this list is empty, indicating outstanding flush operations have completed.
*
* BPF syscalls may race with BPF program calls on any of the update, delete
* or lookup operations. As noted above the xchg() operation also keep the
@@ -56,9 +47,13 @@
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
#define DEV_MAP_BULK_SIZE 16
+struct bpf_dtab_netdev;
+
struct xdp_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+ struct list_head flush_node;
struct net_device *dev_rx;
+ struct bpf_dtab_netdev *obj;
unsigned int count;
};
@@ -73,22 +68,17 @@ struct bpf_dtab_netdev {
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map;
- unsigned long __percpu *flush_needed;
+ struct list_head __percpu *flush_list;
struct list_head list;
};
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
-static u64 dev_map_bitmap_size(const union bpf_attr *attr)
-{
- return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
-}
-
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
struct bpf_dtab *dtab;
- int err = -EINVAL;
+ int err, cpu;
u64 cost;
if (!capable(CAP_NET_ADMIN))
@@ -99,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
+ /* Lookup returns a pointer straight to dev->ifindex, so make sure the
+ * verifier prevents writes from the BPF side
+ */
+ attr->map_flags |= BPF_F_RDONLY_PROG;
+
dtab = kzalloc(sizeof(*dtab), GFP_USER);
if (!dtab)
return ERR_PTR(-ENOMEM);
@@ -107,39 +102,39 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
- cost += dev_map_bitmap_size(attr) * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_dtab;
+ cost += sizeof(struct list_head) * num_possible_cpus();
- dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* if map size is larger than memlock limit, reject it early */
- err = bpf_map_precharge_memlock(dtab->map.pages);
+ /* if map size is larger than memlock limit, reject it */
+ err = bpf_map_charge_init(&dtab->map.memory, cost);
if (err)
goto free_dtab;
err = -ENOMEM;
- /* A per cpu bitfield with a bit per possible net device */
- dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
- __alignof__(unsigned long),
- GFP_KERNEL | __GFP_NOWARN);
- if (!dtab->flush_needed)
- goto free_dtab;
+ dtab->flush_list = alloc_percpu(struct list_head);
+ if (!dtab->flush_list)
+ goto free_charge;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
- goto free_dtab;
+ goto free_percpu;
spin_lock(&dev_map_lock);
list_add_tail_rcu(&dtab->list, &dev_map_list);
spin_unlock(&dev_map_lock);
return &dtab->map;
+
+free_percpu:
+ free_percpu(dtab->flush_list);
+free_charge:
+ bpf_map_charge_finish(&dtab->map.memory);
free_dtab:
- free_percpu(dtab->flush_needed);
kfree(dtab);
return ERR_PTR(err);
}
@@ -164,15 +159,18 @@ static void dev_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_rcu();
+ /* Make sure prior __dev_map_entry_free() have completed. */
+ rcu_barrier();
+
/* To ensure all pending flush operations have completed wait for flush
- * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * list to empty on _all_ cpus.
* Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new bits will be set.
+ * from the program we can assume no new items will be added.
*/
for_each_online_cpu(cpu) {
- unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+ struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
- while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ while (!list_empty(flush_list))
cond_resched();
}
@@ -183,11 +181,12 @@ static void dev_map_free(struct bpf_map *map)
if (!dev)
continue;
+ free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
- free_percpu(dtab->flush_needed);
+ free_percpu(dtab->flush_list);
bpf_map_area_free(dtab->netdev_map);
kfree(dtab);
}
@@ -209,18 +208,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
-
- __set_bit(bit, bitmap);
-}
-
-static int bq_xmit_all(struct bpf_dtab_netdev *obj,
- struct xdp_bulk_queue *bq, u32 flags,
+static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
bool in_napi_ctx)
{
+ struct bpf_dtab_netdev *obj = bq->obj;
struct net_device *dev = obj->dev;
int sent = 0, drops = 0, err = 0;
int i;
@@ -247,6 +238,7 @@ out:
trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
sent, drops, bq->dev_rx, dev, err);
bq->dev_rx = NULL;
+ __list_del_clearprev(&bq->flush_node);
return 0;
error:
/* If ndo_xdp_xmit fails with an errno, no frames have been
@@ -269,30 +261,19 @@ error:
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the ctx bitmap
- * is zeroed before completing to ensure all flush operations have completed.
+ * net device can be torn down. On devmap tear down we ensure the flush list
+ * is empty before completing to ensure all flush operations have completed.
*/
void __dev_map_flush(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
- u32 bit;
-
- for_each_set_bit(bit, bitmap, map->max_entries) {
- struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
- struct xdp_bulk_queue *bq;
-
- /* This is possible if the dev entry is removed by user space
- * between xdp redirect and flush op.
- */
- if (unlikely(!dev))
- continue;
-
- __clear_bit(bit, bitmap);
+ struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
+ struct xdp_bulk_queue *bq, *tmp;
- bq = this_cpu_ptr(dev->bulkq);
- bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
- }
+ rcu_read_lock();
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+ bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
+ rcu_read_unlock();
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -318,10 +299,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
+ struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
- bq_xmit_all(obj, bq, 0, true);
+ bq_xmit_all(bq, 0, true);
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
@@ -331,6 +313,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
bq->dev_rx = dev_rx;
bq->q[bq->count++] = xdpf;
+
+ if (!bq->flush_node.prev)
+ list_add(&bq->flush_node, flush_list);
+
return 0;
}
@@ -381,17 +367,14 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
if (dev->dev->netdev_ops->ndo_xdp_xmit) {
struct xdp_bulk_queue *bq;
- unsigned long *bitmap;
-
int cpu;
+ rcu_read_lock();
for_each_online_cpu(cpu) {
- bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
- __clear_bit(dev->bit, bitmap);
-
bq = per_cpu_ptr(dev->bulkq, cpu);
- bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
+ bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
}
+ rcu_read_unlock();
}
}
@@ -436,8 +419,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
struct net *net = current->nsproxy->net_ns;
gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev, *old_dev;
- u32 i = *(u32 *)key;
u32 ifindex = *(u32 *)value;
+ struct xdp_bulk_queue *bq;
+ u32 i = *(u32 *)key;
+ int cpu;
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
@@ -460,6 +445,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
return -ENOMEM;
}
+ for_each_possible_cpu(cpu) {
+ bq = per_cpu_ptr(dev->bulkq, cpu);
+ bq->obj = dev;
+ }
+
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
free_percpu(dev->bulkq);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index d6b76377cb6e..b44d8c447afd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
@@ -67,7 +59,7 @@ const char *const bpf_class_string[8] = {
[BPF_STX] = "stx",
[BPF_ALU] = "alu",
[BPF_JMP] = "jmp",
- [BPF_RET] = "BUG",
+ [BPF_JMP32] = "jmp32",
[BPF_ALU64] = "alu64",
};
@@ -136,23 +128,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
else
print_bpf_end_insn(verbose, cbs->private_data, insn);
} else if (BPF_OP(insn->code) == BPF_NEG) {
- verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n",
- insn->code, insn->dst_reg,
- class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
+ insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
+ class == BPF_ALU ? 'w' : 'r',
insn->src_reg);
} else {
- verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n",
- insn->code, class == BPF_ALU ? "(u32) " : "",
+ verbose(cbs->private_data, "(%02x) %c%d %s %d\n",
+ insn->code, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
- class == BPF_ALU ? "(u32) " : "",
insn->imm);
}
} else if (class == BPF_STX) {
@@ -206,10 +197,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
* part of the ldimm64 insn is accessible.
*/
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+ bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD ||
+ insn->src_reg == BPF_PSEUDO_MAP_VALUE;
char tmp[64];
- if (map_ptr && !allow_ptr_leaks)
+ if (is_ptr && !allow_ptr_leaks)
imm = 0;
verbose(cbs->private_data, "(%02x) r%d = %s\n",
@@ -220,7 +212,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code);
return;
}
- } else if (class == BPF_JMP) {
+ } else if (class == BPF_JMP32 || class == BPF_JMP) {
u8 opcode = BPF_OP(insn->code);
if (opcode == BPF_CALL) {
@@ -244,13 +236,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
verbose(cbs->private_data, "(%02x) exit\n", insn->code);
} else if (BPF_SRC(insn->code) == BPF_X) {
- verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n",
- insn->code, insn->dst_reg,
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s %c%d goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ class == BPF_JMP32 ? 'w' : 'r',
insn->src_reg, insn->off);
} else {
- verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n",
- insn->code, insn->dst_reg,
+ verbose(cbs->private_data,
+ "(%02x) if %c%d %s 0x%x goto pc%+d\n",
+ insn->code, class == BPF_JMP32 ? 'w' : 'r',
+ insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
insn->imm, insn->off);
}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e1324a834a24..e546b18d27da 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,14 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef __BPF_DISASM_H__
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index f9274114c88d..22066a62c8c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/btf.h>
@@ -23,7 +15,7 @@
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
- BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
+ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
struct bucket {
struct hlist_nulls_head head;
@@ -262,8 +254,8 @@ static int htab_map_alloc_check(union bpf_attr *attr)
/* Guard against local DoS, and discourage production use. */
return -EPERM;
- if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
- /* reserved bits should not be used */
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
return -EINVAL;
if (!lru && percpu_lru)
@@ -360,14 +352,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else
cost += (u64) htab->elem_size * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- /* make sure page count doesn't overflow */
- goto free_htab;
-
- htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* if map size is larger than memlock limit, reject it early */
- err = bpf_map_precharge_memlock(htab->map.pages);
+ /* if map size is larger than memlock limit, reject it */
+ err = bpf_map_charge_init(&htab->map.memory, cost);
if (err)
goto free_htab;
@@ -376,7 +362,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
sizeof(struct bucket),
htab->map.numa_node);
if (!htab->buckets)
- goto free_htab;
+ goto free_charge;
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
@@ -409,6 +395,8 @@ free_prealloc:
prealloc_destroy(htab);
free_buckets:
bpf_map_area_free(htab->buckets);
+free_charge:
+ bpf_map_charge_finish(&htab->map.memory);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -527,18 +515,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
return insn - insn_buf;
}
-static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
+ void *key, const bool mark)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l) {
- bpf_lru_node_set_ref(&l->lru_node);
+ if (mark)
+ bpf_lru_node_set_ref(&l->lru_node);
return l->key + round_up(map->key_size, 8);
}
return NULL;
}
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return __htab_lru_map_lookup_elem(map, key, true);
+}
+
+static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
+{
+ return __htab_lru_map_lookup_elem(map, key, false);
+}
+
static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
@@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
BITS_PER_LONG == 64;
}
-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
- u32 size = htab->map.value_size;
-
- if (percpu || fd_htab_map_needs_adjust(htab))
- size = round_up(size, 8);
- return size;
-}
-
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab_size_value(htab, percpu);
+ u32 size = htab->map.value_size;
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
+ check_and_init_map_lock(&htab->map,
+ l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
if (percpu) {
+ size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
- } else {
+ } else if (fd_htab_map_needs_adjust(htab)) {
+ size = round_up(size, 8);
memcpy(l_new->key + round_up(key_size, 8), value, size);
+ } else {
+ copy_map_value(&htab->map,
+ l_new->key + round_up(key_size, 8),
+ value);
}
l_new->hash = hash;
@@ -805,11 +804,11 @@ dec_count:
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
u64 map_flags)
{
- if (l_old && map_flags == BPF_NOEXIST)
+ if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
/* elem already exists */
return -EEXIST;
- if (!l_old && map_flags == BPF_EXIST)
+ if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
/* elem doesn't exist, cannot update it */
return -ENOENT;
@@ -828,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u32 key_size, hash;
int ret;
- if (unlikely(map_flags > BPF_EXIST))
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
/* unknown flags */
return -EINVAL;
@@ -841,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
b = __select_bucket(htab, hash);
head = &b->head;
+ if (unlikely(map_flags & BPF_F_LOCK)) {
+ if (unlikely(!map_value_has_spin_lock(map)))
+ return -EINVAL;
+ /* find an element without taking the bucket lock */
+ l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
+ htab->n_buckets);
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ return ret;
+ if (l_old) {
+ /* grab the element lock and update value in place */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ return 0;
+ }
+ /* fall through, grab the bucket lock and lookup again.
+ * 99.9% chance that the element won't be found,
+ * but second lookup under lock has to be done.
+ */
+ }
+
/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&b->lock, flags);
@@ -850,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (ret)
goto err;
+ if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
+ /* first lookup without the bucket lock didn't find the element,
+ * but second lookup with the bucket lock found it.
+ * This case is highly unlikely, but has to be dealt with:
+ * grab the element lock in addition to the bucket lock
+ * and update element in place
+ */
+ copy_map_value_locked(map,
+ l_old->key + round_up(key_size, 8),
+ value, false);
+ ret = 0;
+ goto err;
+ }
+
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
l_old);
if (IS_ERR(l_new)) {
@@ -1215,6 +1250,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
@@ -1246,7 +1282,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l;
void __percpu *pptr;
int ret = -ENOENT;
@@ -1262,8 +1297,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
l = __htab_map_lookup_elem(map, key);
if (!l)
goto out;
- if (htab_is_lru(htab))
- bpf_lru_node_set_ref(&l->lru_node);
+ /* We do not mark LRU map element here in order to not mess up
+ * eviction heuristics when user space does a map walk.
+ */
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a74972b07e74..5e28718928ca 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/rcupdate.h>
@@ -18,6 +10,9 @@
#include <linux/sched.h>
#include <linux/uidgid.h>
#include <linux/filter.h>
+#include <linux/ctype.h>
+
+#include "../../lib/kstrtox.h"
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -221,6 +216,102 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg2_type = ARG_CONST_SIZE,
};
+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+ union {
+ __u32 val;
+ arch_spinlock_t lock;
+ } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+ compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+ BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+ BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+ arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ arch_spinlock_t *l = (void *)lock;
+
+ arch_spin_unlock(l);
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+ do {
+ atomic_cond_read_relaxed(l, !VAL);
+ } while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+ atomic_t *l = (void *)lock;
+
+ atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __bpf_spin_lock(lock);
+ __this_cpu_write(irqsave_flags, flags);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+ .func = bpf_spin_lock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ unsigned long flags;
+
+ flags = __this_cpu_read(irqsave_flags);
+ __bpf_spin_unlock(lock);
+ local_irq_restore(flags);
+ return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+ .func = bpf_spin_unlock,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_SPIN_LOCK,
+};
+
+void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
+ bool lock_src)
+{
+ struct bpf_spin_lock *lock;
+
+ if (lock_src)
+ lock = src + map->spin_lock_off;
+ else
+ lock = dst + map->spin_lock_off;
+ preempt_disable();
+ ____bpf_spin_lock(lock);
+ copy_map_value(map, dst, src);
+ ____bpf_spin_unlock(lock);
+ preempt_enable();
+}
+
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
@@ -267,4 +358,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = {
.arg2_type = ARG_ANYTHING,
};
#endif
+
+#define BPF_STRTOX_BASE_MASK 0x1F
+
+static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
+ unsigned long long *res, bool *is_negative)
+{
+ unsigned int base = flags & BPF_STRTOX_BASE_MASK;
+ const char *cur_buf = buf;
+ size_t cur_len = buf_len;
+ unsigned int consumed;
+ size_t val_len;
+ char str[64];
+
+ if (!buf || !buf_len || !res || !is_negative)
+ return -EINVAL;
+
+ if (base != 0 && base != 8 && base != 10 && base != 16)
+ return -EINVAL;
+
+ if (flags & ~BPF_STRTOX_BASE_MASK)
+ return -EINVAL;
+
+ while (cur_buf < buf + buf_len && isspace(*cur_buf))
+ ++cur_buf;
+
+ *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
+ if (*is_negative)
+ ++cur_buf;
+
+ consumed = cur_buf - buf;
+ cur_len -= consumed;
+ if (!cur_len)
+ return -EINVAL;
+
+ cur_len = min(cur_len, sizeof(str) - 1);
+ memcpy(str, cur_buf, cur_len);
+ str[cur_len] = '\0';
+ cur_buf = str;
+
+ cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
+ val_len = _parse_integer(cur_buf, base, res);
+
+ if (val_len & KSTRTOX_OVERFLOW)
+ return -ERANGE;
+
+ if (val_len == 0)
+ return -EINVAL;
+
+ cur_buf += val_len;
+ consumed += cur_buf - str;
+
+ return consumed;
+}
+
+static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
+ long long *res)
+{
+ unsigned long long _res;
+ bool is_negative;
+ int err;
+
+ err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
+ if (err < 0)
+ return err;
+ if (is_negative) {
+ if ((long long)-_res > 0)
+ return -ERANGE;
+ *res = -_res;
+ } else {
+ if ((long long)_res < 0)
+ return -ERANGE;
+ *res = _res;
+ }
+ return err;
+}
+
+BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
+ long *, res)
+{
+ long long _res;
+ int err;
+
+ err = __bpf_strtoll(buf, buf_len, flags, &_res);
+ if (err < 0)
+ return err;
+ if (_res != (long)_res)
+ return -ERANGE;
+ *res = _res;
+ return err;
+}
+
+const struct bpf_func_proto bpf_strtol_proto = {
+ .func = bpf_strtol,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
+ unsigned long *, res)
+{
+ unsigned long long _res;
+ bool is_negative;
+ int err;
+
+ err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
+ if (err < 0)
+ return err;
+ if (is_negative)
+ return -EINVAL;
+ if (_res != (unsigned long)_res)
+ return -ERANGE;
+ *res = _res;
+ return err;
+}
+
+const struct bpf_func_proto bpf_strtoul_proto = {
+ .func = bpf_strtoul,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_LONG,
+};
#endif
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 2ada5e21dfa6..cc0d0cf114e3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Minimal file system backend for holding eBPF maps and programs,
* used by bpf(2) object pinning.
@@ -5,10 +6,6 @@
* Authors:
*
* Daniel Borkmann <daniel@iogearbox.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/init.h>
@@ -518,7 +515,7 @@ out:
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+ int ret = inode_permission(inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
@@ -554,19 +551,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
}
EXPORT_SYMBOL(bpf_prog_get_type_path);
-static void bpf_evict_inode(struct inode *inode)
-{
- enum bpf_type type;
-
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
-
- if (S_ISLNK(inode->i_mode))
- kfree(inode->i_link);
- if (!bpf_inode_type(inode, &type))
- bpf_any_put(inode->i_private, type);
-}
-
/*
* Display the mount options in /proc/mounts.
*/
@@ -579,11 +563,22 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static void bpf_free_inode(struct inode *inode)
+{
+ enum bpf_type type;
+
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_link);
+ if (!bpf_inode_type(inode, &type))
+ bpf_any_put(inode->i_private, type);
+ free_inode_nonrcu(inode);
+}
+
static const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = bpf_show_options,
- .evict_inode = bpf_evict_inode,
+ .free_inode = bpf_free_inode,
};
enum {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 07a34ef562a0..addd6fdceec8 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO
#ifdef CONFIG_CGROUP_BPF
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
struct bpf_cgroup_storage_map {
struct bpf_map map;
@@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
- if (flags != BPF_ANY && flags != BPF_EXIST)
+ if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST)))
+ return -EINVAL;
+
+ if (unlikely(flags & BPF_NOEXIST))
+ return -EINVAL;
+
+ if (unlikely((flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
@@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
if (!storage)
return -ENOENT;
+ if (flags & BPF_F_LOCK) {
+ copy_map_value_locked(map, storage->buf->data, value, false);
+ return 0;
+ }
+
new = kmalloc_node(sizeof(struct bpf_storage_buffer) +
map->value_size,
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
@@ -147,6 +159,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
+ check_and_init_map_lock(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@@ -259,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_cgroup_storage_map *map;
+ struct bpf_map_memory mem;
+ int ret;
if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
return ERR_PTR(-EINVAL);
@@ -269,21 +284,26 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
if (attr->value_size > PAGE_SIZE)
return ERR_PTR(-E2BIG);
- if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK)
- /* reserved bits should not be used */
+ if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
return ERR_PTR(-EINVAL);
if (attr->max_entries)
/* max_entries is not used and enforced to be 0 */
return ERR_PTR(-EINVAL);
+ ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
+ if (ret < 0)
+ return ERR_PTR(ret);
+
map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
__GFP_ZERO | GFP_USER, numa_node);
- if (!map)
+ if (!map) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
+ }
- map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map),
- PAGE_SIZE) >> PAGE_SHIFT;
+ bpf_map_charge_move(&map->map.memory, &mem);
/* copy mandatory map attributes */
bpf_map_init_from_attr(&map->map, attr);
@@ -483,6 +503,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
storage->buf = kmalloc_node(size, flags, map->numa_node);
if (!storage->buf)
goto enomem;
+ check_and_init_map_lock(map, storage->buf->data);
} else {
storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
if (!storage->percpu_buf)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index abf1002080df..56e6c75d354d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Longest prefix match list implementation
*
* Copyright (c) 2016,2017 Daniel Mack
* Copyright (c) 2016 David Herrmann
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/bpf.h>
@@ -471,6 +468,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
}
if (!node || node->prefixlen != key->prefixlen ||
+ node->prefixlen != matchlen ||
(node->flags & LPM_TREE_NODE_FLAG_IM)) {
ret = -ENOENT;
goto out;
@@ -537,7 +535,7 @@ out:
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
- BPF_F_RDONLY | BPF_F_WRONLY)
+ BPF_F_ACCESS_MASK)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
@@ -552,6 +550,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
if (attr->max_entries == 0 ||
!(attr->map_flags & BPF_F_NO_PREALLOC) ||
attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
attr->key_size < LPM_KEY_SIZE_MIN ||
attr->key_size > LPM_KEY_SIZE_MAX ||
attr->value_size < LPM_VAL_SIZE_MIN ||
@@ -571,14 +570,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
cost_per_node = sizeof(struct lpm_trie_node) +
attr->value_size + trie->data_size;
cost += (u64) attr->max_entries * cost_per_node;
- if (cost >= U32_MAX - PAGE_SIZE) {
- ret = -E2BIG;
- goto out_err;
- }
-
- trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- ret = bpf_map_precharge_memlock(trie->map.pages);
+ ret = bpf_map_charge_init(&trie->map.memory, cost);
if (ret)
goto out_err;
@@ -714,9 +707,14 @@ find_leftmost:
* have exact two children, so this function will never return NULL.
*/
for (node = search_root; node;) {
- if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ if (node->flags & LPM_TREE_NODE_FLAG_IM) {
+ node = rcu_dereference(node->child[0]);
+ } else {
next_node = node;
- node = rcu_dereference(node->child[0]);
+ node = rcu_dereference(node->child[0]);
+ if (!node)
+ node = rcu_dereference(next_node->child[1]);
+ }
}
do_copy:
next_key->prefixlen = next_node->prefixlen;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 52378d3e34b3..fab4fb134547 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/slab.h>
#include <linux/bpf.h>
@@ -37,6 +34,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
return ERR_PTR(-EINVAL);
}
+ if (map_value_has_spin_lock(inner_map)) {
+ fdput(f);
+ return ERR_PTR(-ENOTSUPP);
+ }
+
inner_map_meta_size = sizeof(*inner_map_meta);
/* In some cases verifier needs to access beyond just base map. */
if (inner_map->ops == &array_map_ops)
@@ -53,6 +55,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
+ inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 6183db9ec08c..a507bf6ef8b9 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -1,8 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#ifndef __MAP_IN_MAP_H__
#define __MAP_IN_MAP_H__
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 54cf2b9c44a4..ba635209ae9a 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -35,6 +35,7 @@ static DECLARE_RWSEM(bpf_devs_lock);
struct bpf_offload_dev {
const struct bpf_prog_offload_ops *ops;
struct list_head netdevs;
+ void *priv;
};
struct bpf_offload_netdev {
@@ -173,6 +174,41 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env)
return ret;
}
+void
+bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off,
+ struct bpf_insn *insn)
+{
+ const struct bpf_prog_offload_ops *ops;
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ ops = offload->offdev->ops;
+ if (!offload->opt_failed && ops->replace_insn)
+ ret = ops->replace_insn(env, off, insn);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
+void
+bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_prog_offload *offload;
+ int ret = -EOPNOTSUPP;
+
+ down_read(&bpf_devs_lock);
+ offload = env->prog->aux->offload;
+ if (offload) {
+ if (!offload->opt_failed && offload->offdev->ops->remove_insns)
+ ret = offload->offdev->ops->remove_insns(env, off, cnt);
+ offload->opt_failed |= ret;
+ }
+ up_read(&bpf_devs_lock);
+}
+
static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
{
struct bpf_prog_offload *offload = prog->aux->offload;
@@ -634,7 +670,7 @@ unlock:
EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister);
struct bpf_offload_dev *
-bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
+bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)
{
struct bpf_offload_dev *offdev;
int err;
@@ -653,6 +689,7 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops)
return ERR_PTR(-ENOMEM);
offdev->ops = ops;
+ offdev->priv = priv;
INIT_LIST_HEAD(&offdev->netdevs);
return offdev;
@@ -665,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev)
kfree(offdev);
}
EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy);
+
+void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)
+{
+ return offdev->priv;
+}
+EXPORT_SYMBOL_GPL(bpf_offload_dev_priv);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 0c1b4ba9e90e..6e090140b924 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include "percpu_freelist.h"
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index c3960118e617..fbf8a8a28979 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -1,8 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#ifndef __PERCPU_FREELIST_H__
#define __PERCPU_FREELIST_H__
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index b384ea9f3254..f697647ceb54 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -11,8 +11,7 @@
#include "percpu_freelist.h"
#define QUEUE_STACK_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-
+ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
struct bpf_queue_stack {
struct bpf_map map;
@@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 0 ||
attr->value_size == 0 ||
- attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK)
+ attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
return -EINVAL;
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -67,29 +67,28 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
{
int ret, numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_map_memory mem = {0};
struct bpf_queue_stack *qs;
u64 size, queue_size, cost;
size = (u64) attr->max_entries + 1;
cost = queue_size = sizeof(*qs) + size * attr->value_size;
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-E2BIG);
-
- cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- ret = bpf_map_precharge_memlock(cost);
+ ret = bpf_map_charge_init(&mem, cost);
if (ret < 0)
return ERR_PTR(ret);
qs = bpf_map_area_alloc(queue_size, numa_node);
- if (!qs)
+ if (!qs) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
+ }
memset(qs, 0, sizeof(*qs));
bpf_map_init_from_attr(&qs->map, attr);
- qs->map.pages = cost;
+ bpf_map_charge_move(&qs->map.memory, &mem);
qs->size = size;
raw_spin_lock_init(&qs->lock);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..50c083ba978c 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -151,7 +151,8 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
int err, numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- u64 cost, array_size;
+ struct bpf_map_memory mem;
+ u64 array_size;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -159,24 +160,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
array_size = sizeof(*array);
array_size += (u64)attr->max_entries * sizeof(struct sock *);
- /* make sure there is no u32 overflow later in round_up() */
- cost = array_size;
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-ENOMEM);
- cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- err = bpf_map_precharge_memlock(cost);
+ err = bpf_map_charge_init(&mem, array_size);
if (err)
return ERR_PTR(err);
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
- if (!array)
+ if (!array) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
+ }
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
- array->map.pages = cost;
+ bpf_map_charge_move(&array->map.memory, &mem);
return &array->map;
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index d43b14535827..052580c33d26 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/bpf.h>
#include <linux/jhash.h>
@@ -44,7 +41,7 @@ static void do_up_read(struct irq_work *entry)
struct stack_map_irq_work *work;
work = container_of(entry, struct stack_map_irq_work, irq_work);
- up_read(work->sem);
+ up_read_non_owner(work->sem);
work->sem = NULL;
}
@@ -89,6 +86,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
struct bpf_stack_map *smap;
+ struct bpf_map_memory mem;
u64 cost, n_buckets;
int err;
@@ -116,40 +114,37 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
n_buckets = roundup_pow_of_two(attr->max_entries);
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-E2BIG);
+ cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+ err = bpf_map_charge_init(&mem, cost);
+ if (err)
+ return ERR_PTR(err);
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
- if (!smap)
+ if (!smap) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
-
- err = -E2BIG;
- cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_smap;
+ }
bpf_map_init_from_attr(&smap->map, attr);
smap->map.value_size = value_size;
smap->n_buckets = n_buckets;
- smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- err = bpf_map_precharge_memlock(smap->map.pages);
- if (err)
- goto free_smap;
err = get_callchain_buffers(sysctl_perf_event_max_stack);
if (err)
- goto free_smap;
+ goto free_charge;
err = prealloc_elems_and_freelist(smap);
if (err)
goto put_buffers;
+ bpf_map_charge_move(&smap->map.memory, &mem);
+
return &smap->map;
put_buffers:
put_callchain_buffers();
-free_smap:
+free_charge:
+ bpf_map_charge_finish(&mem);
bpf_map_area_free(smap);
return ERR_PTR(err);
}
@@ -338,6 +333,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
} else {
work->sem = &current->mm->mmap_sem;
irq_work_queue(&work->irq_work);
+ /*
+ * The irq_work will release the mmap_sem with
+ * up_read_non_owner(). The rwsem_release() is called
+ * here to release the lock from lockdep's perspective.
+ */
+ rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_);
}
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8577bb7f8be6..5d141f16f6fa 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
@@ -136,21 +128,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
void *bpf_map_area_alloc(size_t size, int numa_node)
{
- /* We definitely need __GFP_NORETRY, so OOM killer doesn't
- * trigger under memory pressure as we really just want to
- * fail instead.
+ /* We really just want to fail instead of triggering OOM killer
+ * under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
+ * which is used for lower order allocation requests.
+ *
+ * It has been observed that higher order allocation requests done by
+ * vmalloc with __GFP_NORETRY being set might fail due to not trying
+ * to reclaim memory from the page cache, thus we set
+ * __GFP_RETRY_MAYFAIL to avoid such situations.
*/
- const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+
+ const gfp_t flags = __GFP_NOWARN | __GFP_ZERO;
void *area;
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- area = kmalloc_node(size, GFP_USER | flags, numa_node);
+ area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags,
+ numa_node);
if (area != NULL)
return area;
}
- return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
- __builtin_return_address(0));
+ return __vmalloc_node_flags_caller(size, numa_node,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL |
+ flags, __builtin_return_address(0));
}
void bpf_map_area_free(void *area)
@@ -158,29 +158,28 @@ void bpf_map_area_free(void *area)
kvfree(area);
}
+static u32 bpf_map_flags_retain_permanent(u32 flags)
+{
+ /* Some map creation flags are not tied to the map object but
+ * rather to the map fd instead, so they have no meaning upon
+ * map object inspection since multiple file descriptors with
+ * different (access) properties can exist here. Thus, given
+ * this has zero meaning for the map itself, lets clear these
+ * from here.
+ */
+ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
+}
+
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
{
map->map_type = attr->map_type;
map->key_size = attr->key_size;
map->value_size = attr->value_size;
map->max_entries = attr->max_entries;
- map->map_flags = attr->map_flags;
+ map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
map->numa_node = bpf_map_attr_numa_node(attr);
}
-int bpf_map_precharge_memlock(u32 pages)
-{
- struct user_struct *user = get_current_user();
- unsigned long memlock_limit, cur;
-
- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- cur = atomic_long_read(&user->locked_vm);
- free_uid(user);
- if (cur + pages > memlock_limit)
- return -EPERM;
- return 0;
-}
-
static int bpf_charge_memlock(struct user_struct *user, u32 pages)
{
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -194,45 +193,62 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages)
static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
{
- atomic_long_sub(pages, &user->locked_vm);
+ if (user)
+ atomic_long_sub(pages, &user->locked_vm);
}
-static int bpf_map_init_memlock(struct bpf_map *map)
+int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
{
- struct user_struct *user = get_current_user();
+ u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
+ struct user_struct *user;
int ret;
- ret = bpf_charge_memlock(user, map->pages);
+ if (size >= U32_MAX - PAGE_SIZE)
+ return -E2BIG;
+
+ user = get_current_user();
+ ret = bpf_charge_memlock(user, pages);
if (ret) {
free_uid(user);
return ret;
}
- map->user = user;
- return ret;
+
+ mem->pages = pages;
+ mem->user = user;
+
+ return 0;
}
-static void bpf_map_release_memlock(struct bpf_map *map)
+void bpf_map_charge_finish(struct bpf_map_memory *mem)
{
- struct user_struct *user = map->user;
- bpf_uncharge_memlock(user, map->pages);
- free_uid(user);
+ bpf_uncharge_memlock(mem->user, mem->pages);
+ free_uid(mem->user);
+}
+
+void bpf_map_charge_move(struct bpf_map_memory *dst,
+ struct bpf_map_memory *src)
+{
+ *dst = *src;
+
+ /* Make sure src will not be used for the redundant uncharging. */
+ memset(src, 0, sizeof(struct bpf_map_memory));
}
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
{
int ret;
- ret = bpf_charge_memlock(map->user, pages);
+ ret = bpf_charge_memlock(map->memory.user, pages);
if (ret)
return ret;
- map->pages += pages;
+ map->memory.pages += pages;
return ret;
}
void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
{
- bpf_uncharge_memlock(map->user, pages);
- map->pages -= pages;
+ bpf_uncharge_memlock(map->memory.user, pages);
+ map->memory.pages -= pages;
}
static int bpf_map_alloc_id(struct bpf_map *map)
@@ -283,11 +299,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
+ struct bpf_map_memory mem;
- bpf_map_release_memlock(map);
+ bpf_map_charge_move(&mem, &map->memory);
security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
+ bpf_map_charge_finish(&mem);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -335,6 +353,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
return 0;
}
+static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
+{
+ fmode_t mode = f.file->f_mode;
+
+ /* Our file permissions may have been overridden by global
+ * map permissions facing syscall side.
+ */
+ if (READ_ONCE(map->frozen))
+ mode &= ~FMODE_CAN_WRITE;
+ return mode;
+}
+
#ifdef CONFIG_PROC_FS
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
@@ -356,14 +386,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
"memlock:\t%llu\n"
- "map_id:\t%u\n",
+ "map_id:\t%u\n"
+ "frozen:\t%u\n",
map->map_type,
map->key_size,
map->value_size,
map->max_entries,
map->map_flags,
- map->pages * 1ULL << PAGE_SHIFT,
- map->id);
+ map->memory.pages * 1ULL << PAGE_SHIFT,
+ map->id,
+ READ_ONCE(map->frozen));
if (owner_prog_type) {
seq_printf(m, "owner_prog_type:\t%u\n",
@@ -440,10 +472,10 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
const char *end = src + BPF_OBJ_NAME_LEN;
memset(dst, 0, BPF_OBJ_NAME_LEN);
-
- /* Copy all isalnum() and '_' char */
+ /* Copy all isalnum(), '_' and '.' chars. */
while (src < end && *src) {
- if (!isalnum(*src) && *src != '_')
+ if (!isalnum(*src) &&
+ *src != '_' && *src != '.')
return -EINVAL;
*dst++ = *src++;
}
@@ -463,21 +495,47 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 btf_key_id, u32 btf_value_id)
{
const struct btf_type *key_type, *value_type;
u32 key_size, value_size;
int ret = 0;
- key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
- if (!key_type || key_size != map->key_size)
- return -EINVAL;
+ /* Some maps allow key to be unspecified. */
+ if (btf_key_id) {
+ key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+ if (!key_type || key_size != map->key_size)
+ return -EINVAL;
+ } else {
+ key_type = btf_type_by_id(btf, 0);
+ if (!map->ops->map_check_btf)
+ return -EINVAL;
+ }
value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
if (!value_type || value_size != map->value_size)
return -EINVAL;
+ map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+ if (map_value_has_spin_lock(map)) {
+ if (map->map_flags & BPF_F_RDONLY_PROG)
+ return -EACCES;
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE)
+ return -ENOTSUPP;
+ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+ map->value_size) {
+ WARN_ONCE(1,
+ "verifier bug spin_lock_off %d value_size %d\n",
+ map->spin_lock_off, map->value_size);
+ return -EFAULT;
+ }
+ }
+
if (map->ops->map_check_btf)
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
@@ -489,6 +547,7 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_map_memory mem;
struct bpf_map *map;
int f_flags;
int err;
@@ -513,7 +572,7 @@ static int map_create(union bpf_attr *attr)
err = bpf_obj_name_cpy(map->name, attr->map_name);
if (err)
- goto free_map_nouncharge;
+ goto free_map;
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
@@ -521,62 +580,60 @@ static int map_create(union bpf_attr *attr)
if (attr->btf_key_type_id || attr->btf_value_type_id) {
struct btf *btf;
- if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
+ if (!attr->btf_value_type_id) {
err = -EINVAL;
- goto free_map_nouncharge;
+ goto free_map;
}
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
- goto free_map_nouncharge;
+ goto free_map;
}
err = map_check_btf(map, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err) {
btf_put(btf);
- goto free_map_nouncharge;
+ goto free_map;
}
map->btf = btf;
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
+ } else {
+ map->spin_lock_off = -EINVAL;
}
err = security_bpf_map_alloc(map);
if (err)
- goto free_map_nouncharge;
-
- err = bpf_map_init_memlock(map);
- if (err)
- goto free_map_sec;
+ goto free_map;
err = bpf_map_alloc_id(map);
if (err)
- goto free_map;
+ goto free_map_sec;
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
/* failed to allocate fd.
- * bpf_map_put() is needed because the above
+ * bpf_map_put_with_uref() is needed because the above
* bpf_map_alloc_id() has published the map
* to the userspace and the userspace may
* have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
*/
- bpf_map_put(map);
+ bpf_map_put_with_uref(map);
return err;
}
return err;
-free_map:
- bpf_map_release_memlock(map);
free_map_sec:
security_bpf_map_free(map);
-free_map_nouncharge:
+free_map:
btf_put(map->btf);
+ bpf_map_charge_move(&mem, &map->memory);
map->ops->map_free(map);
+ bpf_map_charge_finish(&mem);
return err;
}
@@ -664,7 +721,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
static int map_lookup_elem(union bpf_attr *attr)
{
@@ -680,16 +737,24 @@ static int map_lookup_elem(union bpf_attr *attr)
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
+ if (attr->flags & ~BPF_F_LOCK)
+ return -EINVAL;
+
f = fdget(ufd);
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -738,14 +803,23 @@ static int map_lookup_elem(union bpf_attr *attr)
err = map->ops->map_peek_elem(map, value);
} else {
rcu_read_lock();
- ptr = map->ops->map_lookup_elem(map, key);
+ if (map->ops->map_lookup_elem_sys_only)
+ ptr = map->ops->map_lookup_elem_sys_only(map, key);
+ else
+ ptr = map->ops->map_lookup_elem(map, key);
if (IS_ERR(ptr)) {
err = PTR_ERR(ptr);
} else if (!ptr) {
err = -ENOENT;
} else {
err = 0;
- memcpy(value, ptr, value_size);
+ if (attr->flags & BPF_F_LOCK)
+ /* lock 'ptr' and copy everything but lock */
+ copy_map_value_locked(map, value, ptr, true);
+ else
+ copy_map_value(map, value, ptr);
+ /* mask lock, since value wasn't zero inited */
+ check_and_init_map_lock(map, value);
}
rcu_read_unlock();
}
@@ -802,12 +876,17 @@ static int map_update_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
+ if ((attr->flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(map)) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
key = __bpf_copy_key(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -908,8 +987,7 @@ static int map_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -960,8 +1038,7 @@ static int map_get_next_key(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
@@ -1028,8 +1105,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -1071,6 +1147,36 @@ err_put:
return err;
}
+#define BPF_MAP_FREEZE_LAST_FIELD map_fd
+
+static int map_freeze(const union bpf_attr *attr)
+{
+ int err = 0, ufd = attr->map_fd;
+ struct bpf_map *map;
+ struct fd f;
+
+ if (CHECK_ATTR(BPF_MAP_FREEZE))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ if (READ_ONCE(map->frozen)) {
+ err = -EBUSY;
+ goto err_put;
+ }
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ WRITE_ONCE(map->frozen, true);
+err_put:
+ fdput(f);
+ return err;
+}
+
static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name) \
[_id] = & _name ## _prog_ops,
@@ -1219,6 +1325,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del_all(prog);
@@ -1244,24 +1351,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
+static void bpf_prog_get_stats(const struct bpf_prog *prog,
+ struct bpf_prog_stats *stats)
+{
+ u64 nsecs = 0, cnt = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ const struct bpf_prog_stats *st;
+ unsigned int start;
+ u64 tnsecs, tcnt;
+
+ st = per_cpu_ptr(prog->aux->stats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&st->syncp);
+ tnsecs = st->nsecs;
+ tcnt = st->cnt;
+ } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ nsecs += tnsecs;
+ cnt += tcnt;
+ }
+ stats->nsecs = nsecs;
+ stats->cnt = cnt;
+}
+
#ifdef CONFIG_PROC_FS
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
+ struct bpf_prog_stats stats;
+ bpf_prog_get_stats(prog, &stats);
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
"prog_tag:\t%s\n"
"memlock:\t%llu\n"
- "prog_id:\t%u\n",
+ "prog_id:\t%u\n"
+ "run_time_ns:\t%llu\n"
+ "run_cnt:\t%llu\n",
prog->type,
prog->jited,
prog_tag,
prog->pages * 1ULL << PAGE_SHIFT,
- prog->aux->id);
+ prog->aux->id,
+ stats.nsecs,
+ stats.cnt);
}
#endif
@@ -1439,6 +1576,24 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_SETSOCKOPT:
+ case BPF_CGROUP_GETSOCKOPT:
return 0;
default:
return -EINVAL;
@@ -1462,7 +1617,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
- if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
+ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
+ BPF_F_ANY_ALIGNMENT |
+ BPF_F_TEST_RND_HI32))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -1479,7 +1636,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
/* eBPF programs must be GPL compatible to use GPL-ed functions */
is_gpl = license_is_gpl_compatible(license);
- if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
+ if (attr->insn_cnt == 0 ||
+ attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
@@ -1531,7 +1689,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (err < 0)
goto free_prog;
- prog->aux->load_time = ktime_get_boot_ns();
+ prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
if (err)
goto free_prog;
@@ -1562,6 +1720,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
}
bpf_prog_kallsyms_add(prog);
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
return err;
free_used_maps:
@@ -1649,12 +1808,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
}
raw_tp->btp = btp;
- prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
- BPF_PROG_TYPE_RAW_TRACEPOINT);
+ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
if (IS_ERR(prog)) {
err = PTR_ERR(prog);
goto out_free_tp;
}
+ if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
+ prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
err = bpf_probe_register(raw_tp->btp, prog);
if (err)
@@ -1685,7 +1848,12 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ return prog->enforce_expected_attach_type &&
+ prog->expected_attach_type != attach_type ?
+ -EINVAL : 0;
default:
return 0;
}
@@ -1727,6 +1895,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -1748,6 +1918,13 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_FLOW_DISSECTOR:
ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
break;
+ case BPF_CGROUP_SYSCTL:
+ ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
+ break;
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+ break;
default:
return -EINVAL;
}
@@ -1809,6 +1986,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -1826,6 +2005,13 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return lirc_prog_detach(attr);
case BPF_FLOW_DISSECTOR:
return skb_flow_dissector_bpf_prog_detach(attr);
+ case BPF_CGROUP_SYSCTL:
+ ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
+ break;
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+ break;
default:
return -EINVAL;
}
@@ -1857,11 +2043,18 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
+ case BPF_CGROUP_SYSCTL:
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
break;
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
+ case BPF_FLOW_DISSECTOR:
+ return skb_flow_dissector_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -1869,7 +2062,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
return cgroup_bpf_prog_query(attr, uattr);
}
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1882,6 +2075,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
+ if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
+ (!attr->test.ctx_size_in && attr->test.ctx_in))
+ return -EINVAL;
+
+ if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
+ (!attr->test.ctx_size_out && attr->test.ctx_out))
+ return -EINVAL;
+
prog = bpf_prog_get(attr->test.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -1986,19 +2187,32 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
fd = bpf_map_new_fd(map, f_flags);
if (fd < 0)
- bpf_map_put(map);
+ bpf_map_put_with_uref(map);
return fd;
}
static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
- unsigned long addr)
+ unsigned long addr, u32 *off,
+ u32 *type)
{
+ const struct bpf_map *map;
int i;
- for (i = 0; i < prog->aux->used_map_cnt; i++)
- if (prog->aux->used_maps[i] == (void *)addr)
- return prog->aux->used_maps[i];
+ for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (map == (void *)addr) {
+ *type = BPF_PSEUDO_MAP_FD;
+ return map;
+ }
+ if (!map->ops->map_direct_value_meta)
+ continue;
+ if (!map->ops->map_direct_value_meta(map, addr, off)) {
+ *type = BPF_PSEUDO_MAP_VALUE;
+ return map;
+ }
+ }
+
return NULL;
}
@@ -2006,6 +2220,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
{
const struct bpf_map *map;
struct bpf_insn *insns;
+ u32 off, type;
u64 imm;
int i;
@@ -2033,11 +2248,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
continue;
imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
- map = bpf_map_from_imm(prog, imm);
+ map = bpf_map_from_imm(prog, imm, &off, &type);
if (map) {
- insns[i].src_reg = BPF_PSEUDO_MAP_FD;
+ insns[i].src_reg = type;
insns[i].imm = map->id;
- insns[i + 1].imm = 0;
+ insns[i + 1].imm = off;
continue;
}
}
@@ -2083,6 +2298,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_prog_info info = {};
u32 info_len = attr->info.info_len;
+ struct bpf_prog_stats stats;
char __user *uinsns;
u32 ulen;
int err;
@@ -2122,6 +2338,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
if (err)
return err;
+ bpf_prog_get_stats(prog, &stats);
+ info.run_time_ns = stats.nsecs;
+ info.run_cnt = stats.cnt;
+
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
@@ -2622,6 +2842,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
break;
+ case BPF_MAP_FREEZE:
+ err = map_freeze(&attr);
+ break;
case BPF_PROG_LOAD:
err = bpf_prog_load(&attr, uattr);
break;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 938d41211be7..ca52b9642943 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* tnum: tracked (or tristate) numbers
*
* A tnum tracks knowledge about the bits of a value. Each bit can be either
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 56674a7c3778..a2e763703c30 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
* Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <uapi/linux/btf.h>
#include <linux/kernel.h>
@@ -176,8 +168,7 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 131072
-#define BPF_COMPLEXITY_LIMIT_STACK 1024
+#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
#define BPF_COMPLEXITY_LIMIT_STATES 64
#define BPF_MAP_PTR_UNPRIV 1UL
@@ -212,7 +203,8 @@ struct bpf_call_arg_meta {
int access_size;
s64 msize_smax_value;
u64 msize_umax_value;
- int ptr_id;
+ int ref_obj_id;
+ int func_id;
};
static DEFINE_MUTEX(bpf_verifier_lock);
@@ -330,35 +322,39 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
type == PTR_TO_PACKET_META;
}
-static bool reg_type_may_be_null(enum bpf_reg_type type)
-{
- return type == PTR_TO_MAP_VALUE_OR_NULL ||
- type == PTR_TO_SOCKET_OR_NULL;
-}
-
-static bool type_is_refcounted(enum bpf_reg_type type)
+static bool type_is_sk_pointer(enum bpf_reg_type type)
{
- return type == PTR_TO_SOCKET;
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_XDP_SOCK;
}
-static bool type_is_refcounted_or_null(enum bpf_reg_type type)
+static bool reg_type_may_be_null(enum bpf_reg_type type)
{
- return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL;
+ return type == PTR_TO_MAP_VALUE_OR_NULL ||
+ type == PTR_TO_SOCKET_OR_NULL ||
+ type == PTR_TO_SOCK_COMMON_OR_NULL ||
+ type == PTR_TO_TCP_SOCK_OR_NULL;
}
-static bool reg_is_refcounted(const struct bpf_reg_state *reg)
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
- return type_is_refcounted(reg->type);
+ return reg->type == PTR_TO_MAP_VALUE &&
+ map_value_has_spin_lock(reg->map_ptr);
}
-static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
+static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
{
- return type_is_refcounted_or_null(reg->type);
+ return type == PTR_TO_SOCKET ||
+ type == PTR_TO_SOCKET_OR_NULL ||
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_TCP_SOCK_OR_NULL;
}
-static bool arg_type_is_refcounted(enum bpf_arg_type type)
+static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
{
- return type == ARG_PTR_TO_SOCKET;
+ return type == ARG_PTR_TO_SOCK_COMMON;
}
/* Determine whether the function releases some resources allocated by another
@@ -370,6 +366,19 @@ static bool is_release_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_sk_release;
}
+static bool is_acquire_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_lookup_tcp ||
+ func_id == BPF_FUNC_sk_lookup_udp ||
+ func_id == BPF_FUNC_skc_lookup_tcp;
+}
+
+static bool is_ptr_cast_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_tcp_sock ||
+ func_id == BPF_FUNC_sk_fullsock;
+}
+
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
@@ -385,6 +394,12 @@ static const char * const reg_type_str[] = {
[PTR_TO_FLOW_KEYS] = "flow_keys",
[PTR_TO_SOCKET] = "sock",
[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
};
static char slot_type_char[] = {
@@ -432,14 +447,16 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose(env, " R%d", i);
print_liveness(env, reg->live);
verbose(env, "=%s", reg_type_str[t]);
+ if (t == SCALAR_VALUE && reg->precise)
+ verbose(env, "P");
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
- if (t == PTR_TO_STACK)
- verbose(env, ",call_%d", func(env, reg)->callsite);
} else {
verbose(env, "(id=%d", reg->id);
+ if (reg_type_may_be_refcounted_or_null(t))
+ verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
if (t != SCALAR_VALUE)
verbose(env, ",off=%d", reg->off);
if (type_is_pkt_pointer(t))
@@ -497,11 +514,17 @@ static void print_verifier_state(struct bpf_verifier_env *env,
continue;
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, state->stack[i].spilled_ptr.live);
- if (state->stack[i].slot_type[0] == STACK_SPILL)
- verbose(env, "=%s",
- reg_type_str[state->stack[i].spilled_ptr.type]);
- else
+ if (state->stack[i].slot_type[0] == STACK_SPILL) {
+ reg = &state->stack[i].spilled_ptr;
+ t = reg->type;
+ verbose(env, "=%s", reg_type_str[t]);
+ if (t == SCALAR_VALUE && reg->precise)
+ verbose(env, "P");
+ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
+ verbose(env, "%lld", reg->var_off.value + reg->off);
+ } else {
verbose(env, "=%s", types_buf);
+ }
}
if (state->acquired_refs && state->refs[0].id) {
verbose(env, " refs=%d", state->refs[0].id);
@@ -611,13 +634,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
}
/* release function corresponding to acquire_reference_state(). Idempotent. */
-static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
+static int release_reference_state(struct bpf_func_state *state, int ptr_id)
{
int i, last_idx;
- if (!ptr_id)
- return -EFAULT;
-
last_idx = state->acquired_refs - 1;
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].id == ptr_id) {
@@ -629,21 +649,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id)
return 0;
}
}
- return -EFAULT;
-}
-
-/* variation on the above for cases where we expect that there must be an
- * outstanding reference for the specified ptr_id.
- */
-static int release_reference_state(struct bpf_verifier_env *env, int ptr_id)
-{
- struct bpf_func_state *state = cur_func(env);
- int err;
-
- err = __release_reference_state(state, ptr_id);
- if (WARN_ON_ONCE(err != 0))
- verbose(env, "verifier internal error: can't release reference\n");
- return err;
+ return -EINVAL;
}
static int transfer_reference_state(struct bpf_func_state *dst,
@@ -667,6 +673,13 @@ static void free_func_state(struct bpf_func_state *state)
kfree(state);
}
+static void clear_jmp_history(struct bpf_verifier_state *state)
+{
+ kfree(state->jmp_history);
+ state->jmp_history = NULL;
+ state->jmp_history_cnt = 0;
+}
+
static void free_verifier_state(struct bpf_verifier_state *state,
bool free_self)
{
@@ -676,6 +689,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
free_func_state(state->frame[i]);
state->frame[i] = NULL;
}
+ clear_jmp_history(state);
if (free_self)
kfree(state);
}
@@ -703,8 +717,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
const struct bpf_verifier_state *src)
{
struct bpf_func_state *dst;
+ u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
int i, err;
+ if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
+ kfree(dst_state->jmp_history);
+ dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
+ if (!dst_state->jmp_history)
+ return -ENOMEM;
+ }
+ memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
+ dst_state->jmp_history_cnt = src->jmp_history_cnt;
+
/* if dst has more stack frames then src frame, free them */
for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
free_func_state(dst_state->frame[i]);
@@ -712,6 +736,11 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
}
dst_state->speculative = src->speculative;
dst_state->curframe = src->curframe;
+ dst_state->active_spin_lock = src->active_spin_lock;
+ dst_state->branches = src->branches;
+ dst_state->parent = src->parent;
+ dst_state->first_insn_idx = src->first_insn_idx;
+ dst_state->last_insn_idx = src->last_insn_idx;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -727,6 +756,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return 0;
}
+static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ while (st) {
+ u32 br = --st->branches;
+
+ /* WARN_ON(br > 1) technically makes sense here,
+ * but see comment in push_stack(), hence:
+ */
+ WARN_ONCE((int)br < 0,
+ "BUG update_branch_counts:branches_to_explore=%d\n",
+ br);
+ if (br)
+ break;
+ st = st->parent;
+ }
+}
+
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
int *insn_idx)
{
@@ -775,10 +821,23 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
if (err)
goto err;
elem->st.speculative |= speculative;
- if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
- verbose(env, "BPF program is too complex\n");
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+ verbose(env, "The sequence of %d jumps is too complex.\n",
+ env->stack_size);
goto err;
}
+ if (elem->st.parent) {
+ ++elem->st.parent->branches;
+ /* WARN_ON(branches > 2) technically makes sense here,
+ * but
+ * 1. speculative states will bump 'branches' for non-branch
+ * instructions
+ * 2. is_state_visited() heuristics may decide not to create
+ * a new state for a sequence of branches and all such current
+ * and cloned states will be pointing to a single parent state
+ * which might have large 'branches' count.
+ */
+ }
return &elem->st;
err:
free_verifier_state(env->cur_state, true);
@@ -926,6 +985,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
reg->smax_value = S64_MAX;
reg->umin_value = 0;
reg->umax_value = U64_MAX;
+
+ /* constant backtracking is enabled for root only for now */
+ reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
}
/* Mark a register as having a completely unknown (scalar) value. */
@@ -974,6 +1036,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
__mark_reg_not_init(regs + regno);
}
+#define DEF_NOT_SUBREG (0)
static void init_reg_state(struct bpf_verifier_env *env,
struct bpf_func_state *state)
{
@@ -984,6 +1047,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
regs[i].parent = NULL;
+ regs[i].subreg_def = DEF_NOT_SUBREG;
}
/* frame pointer */
@@ -1085,7 +1149,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
*/
subprog[env->subprog_cnt].start = insn_cnt;
- if (env->log.level > 1)
+ if (env->log.level & BPF_LOG_LEVEL2)
for (i = 0; i < env->subprog_cnt; i++)
verbose(env, "func#%d @%d\n", i, subprog[i].start);
@@ -1095,7 +1159,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;
- if (BPF_CLASS(code) != BPF_JMP)
+ if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
@@ -1129,9 +1193,10 @@ next:
*/
static int mark_reg_read(struct bpf_verifier_env *env,
const struct bpf_reg_state *state,
- struct bpf_reg_state *parent)
+ struct bpf_reg_state *parent, u8 flag)
{
bool writes = parent == state->parent; /* Observe write marks */
+ int cnt = 0;
while (parent) {
/* if read wasn't screened by an earlier write ... */
@@ -1143,50 +1208,625 @@ static int mark_reg_read(struct bpf_verifier_env *env,
parent->var_off.value, parent->off);
return -EFAULT;
}
+ /* The first condition is more likely to be true than the
+ * second, checked it first.
+ */
+ if ((parent->live & REG_LIVE_READ) == flag ||
+ parent->live & REG_LIVE_READ64)
+ /* The parentage chain never changes and
+ * this parent was already marked as LIVE_READ.
+ * There is no need to keep walking the chain again and
+ * keep re-marking all parents as LIVE_READ.
+ * This case happens when the same register is read
+ * multiple times without writes into it in-between.
+ * Also, if parent has the stronger REG_LIVE_READ64 set,
+ * then no need to set the weak REG_LIVE_READ32.
+ */
+ break;
/* ... then we depend on parent's value */
- parent->live |= REG_LIVE_READ;
+ parent->live |= flag;
+ /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
+ if (flag == REG_LIVE_READ64)
+ parent->live &= ~REG_LIVE_READ32;
state = parent;
parent = state->parent;
writes = true;
+ cnt++;
}
+
+ if (env->longest_mark_read_walk < cnt)
+ env->longest_mark_read_walk = cnt;
return 0;
}
+/* This function is supposed to be used by the following 32-bit optimization
+ * code only. It returns TRUE if the source or destination register operates
+ * on 64-bit, otherwise return FALSE.
+ */
+static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
+{
+ u8 code, class, op;
+
+ code = insn->code;
+ class = BPF_CLASS(code);
+ op = BPF_OP(code);
+ if (class == BPF_JMP) {
+ /* BPF_EXIT for "main" will reach here. Return TRUE
+ * conservatively.
+ */
+ if (op == BPF_EXIT)
+ return true;
+ if (op == BPF_CALL) {
+ /* BPF to BPF call will reach here because of marking
+ * caller saved clobber with DST_OP_NO_MARK for which we
+ * don't care the register def because they are anyway
+ * marked as NOT_INIT already.
+ */
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ return false;
+ /* Helper call will reach here because of arg type
+ * check, conservatively return TRUE.
+ */
+ if (t == SRC_OP)
+ return true;
+
+ return false;
+ }
+ }
+
+ if (class == BPF_ALU64 || class == BPF_JMP ||
+ /* BPF_END always use BPF_ALU class. */
+ (class == BPF_ALU && op == BPF_END && insn->imm == 64))
+ return true;
+
+ if (class == BPF_ALU || class == BPF_JMP32)
+ return false;
+
+ if (class == BPF_LDX) {
+ if (t != SRC_OP)
+ return BPF_SIZE(code) == BPF_DW;
+ /* LDX source must be ptr. */
+ return true;
+ }
+
+ if (class == BPF_STX) {
+ if (reg->type != SCALAR_VALUE)
+ return true;
+ return BPF_SIZE(code) == BPF_DW;
+ }
+
+ if (class == BPF_LD) {
+ u8 mode = BPF_MODE(code);
+
+ /* LD_IMM64 */
+ if (mode == BPF_IMM)
+ return true;
+
+ /* Both LD_IND and LD_ABS return 32-bit data. */
+ if (t != SRC_OP)
+ return false;
+
+ /* Implicit ctx ptr. */
+ if (regno == BPF_REG_6)
+ return true;
+
+ /* Explicit source could be any width. */
+ return true;
+ }
+
+ if (class == BPF_ST)
+ /* The only source register for BPF_ST is a ptr. */
+ return true;
+
+ /* Conservatively return true at default. */
+ return true;
+}
+
+/* Return TRUE if INSN doesn't have explicit value define. */
+static bool insn_no_def(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ return (class == BPF_JMP || class == BPF_JMP32 ||
+ class == BPF_STX || class == BPF_ST);
+}
+
+/* Return TRUE if INSN has defined any 32-bit value explicitly. */
+static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ if (insn_no_def(insn))
+ return false;
+
+ return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
+}
+
+static void mark_insn_zext(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ s32 def_idx = reg->subreg_def;
+
+ if (def_idx == DEF_NOT_SUBREG)
+ return;
+
+ env->insn_aux_data[def_idx - 1].zext_dst = true;
+ /* The dst will be zero extended, so won't be sub-register anymore. */
+ reg->subreg_def = DEF_NOT_SUBREG;
+}
+
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
+ struct bpf_reg_state *reg, *regs = state->regs;
+ bool rw64;
if (regno >= MAX_BPF_REG) {
verbose(env, "R%d is invalid\n", regno);
return -EINVAL;
}
+ reg = &regs[regno];
+ rw64 = is_reg64(env, insn, regno, reg, t);
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
- if (regs[regno].type == NOT_INIT) {
+ if (reg->type == NOT_INIT) {
verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
/* We don't need to worry about FP liveness because it's read-only */
- if (regno != BPF_REG_FP)
- return mark_reg_read(env, &regs[regno],
- regs[regno].parent);
+ if (regno == BPF_REG_FP)
+ return 0;
+
+ if (rw64)
+ mark_insn_zext(env, reg);
+
+ return mark_reg_read(env, reg, reg->parent,
+ rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose(env, "frame pointer is read only\n");
return -EACCES;
}
- regs[regno].live |= REG_LIVE_WRITTEN;
+ reg->live |= REG_LIVE_WRITTEN;
+ reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
if (t == DST_OP)
mark_reg_unknown(env, regs, regno);
}
return 0;
}
+/* for any branch, call, exit record the history of jmps in the given state */
+static int push_jmp_history(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur)
+{
+ u32 cnt = cur->jmp_history_cnt;
+ struct bpf_idx_pair *p;
+
+ cnt++;
+ p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+ if (!p)
+ return -ENOMEM;
+ p[cnt - 1].idx = env->insn_idx;
+ p[cnt - 1].prev_idx = env->prev_insn_idx;
+ cur->jmp_history = p;
+ cur->jmp_history_cnt = cnt;
+ return 0;
+}
+
+/* Backtrack one insn at a time. If idx is not at the top of recorded
+ * history then previous instruction came from straight line execution.
+ */
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+ u32 *history)
+{
+ u32 cnt = *history;
+
+ if (cnt && st->jmp_history[cnt - 1].idx == i) {
+ i = st->jmp_history[cnt - 1].prev_idx;
+ (*history)--;
+ } else {
+ i--;
+ }
+ return i;
+}
+
+/* For given verifier state backtrack_insn() is called from the last insn to
+ * the first insn. Its purpose is to compute a bitmask of registers and
+ * stack slots that needs precision in the parent verifier state.
+ */
+static int backtrack_insn(struct bpf_verifier_env *env, int idx,
+ u32 *reg_mask, u64 *stack_mask)
+{
+ const struct bpf_insn_cbs cbs = {
+ .cb_print = verbose,
+ .private_data = env,
+ };
+ struct bpf_insn *insn = env->prog->insnsi + idx;
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u32 dreg = 1u << insn->dst_reg;
+ u32 sreg = 1u << insn->src_reg;
+ u32 spi;
+
+ if (insn->code == 0)
+ return 0;
+ if (env->log.level & BPF_LOG_LEVEL) {
+ verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
+ verbose(env, "%d: ", idx);
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ }
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (!(*reg_mask & dreg))
+ return 0;
+ if (opcode == BPF_MOV) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg = sreg
+ * dreg needs precision after this insn
+ * sreg needs precision before this insn
+ */
+ *reg_mask &= ~dreg;
+ *reg_mask |= sreg;
+ } else {
+ /* dreg = K
+ * dreg needs precision after this insn.
+ * Corresponding register is already marked
+ * as precise=true in this verifier state.
+ * No further markings in parent are necessary
+ */
+ *reg_mask &= ~dreg;
+ }
+ } else {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg += sreg
+ * both dreg and sreg need precision
+ * before this insn
+ */
+ *reg_mask |= sreg;
+ } /* else dreg += K
+ * dreg still needs precision before this insn
+ */
+ }
+ } else if (class == BPF_LDX) {
+ if (!(*reg_mask & dreg))
+ return 0;
+ *reg_mask &= ~dreg;
+
+ /* scalars can only be spilled into stack w/o losing precision.
+ * Load from any other memory can be zero extended.
+ * The desire to keep that precision is already indicated
+ * by 'precise' mark in corresponding register of this state.
+ * No further tracking necessary.
+ */
+ if (insn->src_reg != BPF_REG_FP)
+ return 0;
+ if (BPF_SIZE(insn->code) != BPF_DW)
+ return 0;
+
+ /* dreg = *(u64 *)[fp - off] was a fill from the stack.
+ * that [fp - off] slot contains scalar that needs to be
+ * tracked with precision
+ */
+ spi = (-insn->off - 1) / BPF_REG_SIZE;
+ if (spi >= 64) {
+ verbose(env, "BUG spi %d\n", spi);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ *stack_mask |= 1ull << spi;
+ } else if (class == BPF_STX) {
+ if (*reg_mask & dreg)
+ /* stx shouldn't be using _scalar_ dst_reg
+ * to access memory. It means backtracking
+ * encountered a case of pointer subtraction.
+ */
+ return -ENOTSUPP;
+ /* scalars can only be spilled into stack */
+ if (insn->dst_reg != BPF_REG_FP)
+ return 0;
+ if (BPF_SIZE(insn->code) != BPF_DW)
+ return 0;
+ spi = (-insn->off - 1) / BPF_REG_SIZE;
+ if (spi >= 64) {
+ verbose(env, "BUG spi %d\n", spi);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ if (!(*stack_mask & (1ull << spi)))
+ return 0;
+ *stack_mask &= ~(1ull << spi);
+ *reg_mask |= sreg;
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
+ if (opcode == BPF_CALL) {
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ return -ENOTSUPP;
+ /* regular helper call sets R0 */
+ *reg_mask &= ~1;
+ if (*reg_mask & 0x3f) {
+ /* if backtracing was looking for registers R1-R5
+ * they should have been found already.
+ */
+ verbose(env, "BUG regs %x\n", *reg_mask);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ } else if (opcode == BPF_EXIT) {
+ return -ENOTSUPP;
+ }
+ } else if (class == BPF_LD) {
+ if (!(*reg_mask & dreg))
+ return 0;
+ *reg_mask &= ~dreg;
+ /* It's ld_imm64 or ld_abs or ld_ind.
+ * For ld_imm64 no further tracking of precision
+ * into parent is necessary
+ */
+ if (mode == BPF_IND || mode == BPF_ABS)
+ /* to be analyzed */
+ return -ENOTSUPP;
+ } else if (class == BPF_ST) {
+ if (*reg_mask & dreg)
+ /* likely pointer subtraction */
+ return -ENOTSUPP;
+ }
+ return 0;
+}
+
+/* the scalar precision tracking algorithm:
+ * . at the start all registers have precise=false.
+ * . scalar ranges are tracked as normal through alu and jmp insns.
+ * . once precise value of the scalar register is used in:
+ * . ptr + scalar alu
+ * . if (scalar cond K|scalar)
+ * . helper_call(.., scalar, ...) where ARG_CONST is expected
+ * backtrack through the verifier states and mark all registers and
+ * stack slots with spilled constants that these scalar regisers
+ * should be precise.
+ * . during state pruning two registers (or spilled stack slots)
+ * are equivalent if both are not precise.
+ *
+ * Note the verifier cannot simply walk register parentage chain,
+ * since many different registers and stack slots could have been
+ * used to compute single precise scalar.
+ *
+ * The approach of starting with precise=true for all registers and then
+ * backtrack to mark a register as not precise when the verifier detects
+ * that program doesn't care about specific value (e.g., when helper
+ * takes register as ARG_ANYTHING parameter) is not safe.
+ *
+ * It's ok to walk single parentage chain of the verifier states.
+ * It's possible that this backtracking will go all the way till 1st insn.
+ * All other branches will be explored for needing precision later.
+ *
+ * The backtracking needs to deal with cases like:
+ * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
+ * r9 -= r8
+ * r5 = r9
+ * if r5 > 0x79f goto pc+7
+ * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
+ * r5 += 1
+ * ...
+ * call bpf_perf_event_output#25
+ * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
+ *
+ * and this case:
+ * r6 = 1
+ * call foo // uses callee's r6 inside to compute r0
+ * r0 += r6
+ * if r0 == 0 goto
+ *
+ * to track above reg_mask/stack_mask needs to be independent for each frame.
+ *
+ * Also if parent's curframe > frame where backtracking started,
+ * the verifier need to mark registers in both frames, otherwise callees
+ * may incorrectly prune callers. This is similar to
+ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
+ *
+ * For now backtracking falls back into conservative marking.
+ */
+static void mark_all_scalars_precise(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ /* big hammer: mark all scalars precise in this path.
+ * pop_stack may still get !precise scalars.
+ */
+ for (; st; st = st->parent)
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = true;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (func->stack[j].slot_type[0] != STACK_SPILL)
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = true;
+ }
+ }
+}
+
+static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
+ int spi)
+{
+ struct bpf_verifier_state *st = env->cur_state;
+ int first_idx = st->first_insn_idx;
+ int last_idx = env->insn_idx;
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ u32 reg_mask = regno >= 0 ? 1u << regno : 0;
+ u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
+ bool skip_first = true;
+ bool new_marks = false;
+ int i, err;
+
+ if (!env->allow_ptr_leaks)
+ /* backtracking is root only for now */
+ return 0;
+
+ func = st->frame[st->curframe];
+ if (regno >= 0) {
+ reg = &func->regs[regno];
+ if (reg->type != SCALAR_VALUE) {
+ WARN_ONCE(1, "backtracing misuse");
+ return -EFAULT;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ else
+ reg_mask = 0;
+ reg->precise = true;
+ }
+
+ while (spi >= 0) {
+ if (func->stack[spi].slot_type[0] != STACK_SPILL) {
+ stack_mask = 0;
+ break;
+ }
+ reg = &func->stack[spi].spilled_ptr;
+ if (reg->type != SCALAR_VALUE) {
+ stack_mask = 0;
+ break;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ else
+ stack_mask = 0;
+ reg->precise = true;
+ break;
+ }
+
+ if (!new_marks)
+ return 0;
+ if (!reg_mask && !stack_mask)
+ return 0;
+ for (;;) {
+ DECLARE_BITMAP(mask, 64);
+ u32 history = st->jmp_history_cnt;
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+ for (i = last_idx;;) {
+ if (skip_first) {
+ err = 0;
+ skip_first = false;
+ } else {
+ err = backtrack_insn(env, i, &reg_mask, &stack_mask);
+ }
+ if (err == -ENOTSUPP) {
+ mark_all_scalars_precise(env, st);
+ return 0;
+ } else if (err) {
+ return err;
+ }
+ if (!reg_mask && !stack_mask)
+ /* Found assignment(s) into tracked register in this state.
+ * Since this state is already marked, just return.
+ * Nothing to be tracked further in the parent state.
+ */
+ return 0;
+ if (i == first_idx)
+ break;
+ i = get_prev_insn_idx(st, i, &history);
+ if (i >= env->prog->len) {
+ /* This can happen if backtracking reached insn 0
+ * and there are still reg_mask or stack_mask
+ * to backtrack.
+ * It means the backtracking missed the spot where
+ * particular register was initialized with a constant.
+ */
+ verbose(env, "BUG backtracking idx %d\n", i);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ }
+ st = st->parent;
+ if (!st)
+ break;
+
+ new_marks = false;
+ func = st->frame[st->curframe];
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ reg_mask &= ~(1u << i);
+ continue;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ reg->precise = true;
+ }
+
+ bitmap_from_u64(mask, stack_mask);
+ for_each_set_bit(i, mask, 64) {
+ if (i >= func->allocated_stack / BPF_REG_SIZE) {
+ /* This can happen if backtracking
+ * is propagating stack precision where
+ * caller has larger stack frame
+ * than callee, but backtrack_insn() should
+ * have returned -ENOTSUPP.
+ */
+ verbose(env, "BUG spi %d stack_size %d\n",
+ i, func->allocated_stack);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+
+ if (func->stack[i].slot_type[0] != STACK_SPILL) {
+ stack_mask &= ~(1ull << i);
+ continue;
+ }
+ reg = &func->stack[i].spilled_ptr;
+ if (reg->type != SCALAR_VALUE) {
+ stack_mask &= ~(1ull << i);
+ continue;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ reg->precise = true;
+ }
+ if (env->log.level & BPF_LOG_LEVEL) {
+ print_verifier_state(env, func);
+ verbose(env, "parent %s regs=%x stack=%llx marks\n",
+ new_marks ? "didn't have" : "already had",
+ reg_mask, stack_mask);
+ }
+
+ if (!reg_mask && !stack_mask)
+ break;
+ if (!new_marks)
+ break;
+
+ last_idx = st->last_insn_idx;
+ first_idx = st->first_insn_idx;
+ }
+ return 0;
+}
+
+static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+{
+ return __mark_chain_precision(env, regno, -1);
+}
+
+static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+{
+ return __mark_chain_precision(env, -1, spi);
+}
+
static bool is_spillable_regtype(enum bpf_reg_type type)
{
switch (type) {
@@ -1201,6 +1841,11 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case CONST_PTR_TO_MAP:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
+ case PTR_TO_XDP_SOCK:
return true;
default:
return false;
@@ -1213,6 +1858,23 @@ static bool register_is_null(struct bpf_reg_state *reg)
return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
}
+static bool register_is_const(struct bpf_reg_state *reg)
+{
+ return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
+}
+
+static void save_register_state(struct bpf_func_state *state,
+ int spi, struct bpf_reg_state *reg)
+{
+ int i;
+
+ state->stack[spi].spilled_ptr = *reg;
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ state->stack[spi].slot_type[i] = STACK_SPILL;
+}
+
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -1222,7 +1884,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
- enum bpf_reg_type type;
+ u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
+ struct bpf_reg_state *reg = NULL;
err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
state->acquired_refs, true);
@@ -1239,27 +1902,48 @@ static int check_stack_write(struct bpf_verifier_env *env,
}
cur = env->cur_state->frame[env->cur_state->curframe];
- if (value_regno >= 0 &&
- is_spillable_regtype((type = cur->regs[value_regno].type))) {
-
+ if (value_regno >= 0)
+ reg = &cur->regs[value_regno];
+
+ if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
+ !register_is_null(reg) && env->allow_ptr_leaks) {
+ if (dst_reg != BPF_REG_FP) {
+ /* The backtracking logic can only recognize explicit
+ * stack slot address like [fp - 8]. Other spill of
+ * scalar via different register has to be conervative.
+ * Backtrack from here and mark all registers as precise
+ * that contributed into 'reg' being a constant.
+ */
+ err = mark_chain_precision(env, value_regno);
+ if (err)
+ return err;
+ }
+ save_register_state(state, spi, reg);
+ } else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
+ verbose_linfo(env, insn_idx, "; ");
verbose(env, "invalid size of register spill\n");
return -EACCES;
}
- if (state != cur && type == PTR_TO_STACK) {
+ if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
- /* save register state */
- state->stack[spi].spilled_ptr = cur->regs[value_regno];
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ if (!env->allow_ptr_leaks) {
+ bool sanitize = false;
- for (i = 0; i < BPF_REG_SIZE; i++) {
- if (state->stack[spi].slot_type[i] == STACK_MISC &&
- !env->allow_ptr_leaks) {
+ if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ register_is_const(&state->stack[spi].spilled_ptr))
+ sanitize = true;
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (state->stack[spi].slot_type[i] == STACK_MISC) {
+ sanitize = true;
+ break;
+ }
+ if (sanitize) {
int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
int soff = (-spi - 1) * BPF_REG_SIZE;
@@ -1282,8 +1966,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
}
*poff = soff;
}
- state->stack[spi].slot_type[i] = STACK_SPILL;
}
+ save_register_state(state, spi, reg);
} else {
u8 type = STACK_MISC;
@@ -1306,9 +1990,13 @@ static int check_stack_write(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
/* when we zero initialize stack slots mark them as such */
- if (value_regno >= 0 &&
- register_is_null(&cur->regs[value_regno]))
+ if (reg && register_is_null(reg)) {
+ /* backtracking doesn't work for STACK_ZERO yet. */
+ err = mark_chain_precision(env, value_regno);
+ if (err)
+ return err;
type = STACK_ZERO;
+ }
/* Mark slots affected by this stack write. */
for (i = 0; i < size; i++)
@@ -1325,6 +2013,7 @@ static int check_stack_read(struct bpf_verifier_env *env,
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+ struct bpf_reg_state *reg;
u8 *stype;
if (reg_state->allocated_stack <= slot) {
@@ -1333,11 +2022,21 @@ static int check_stack_read(struct bpf_verifier_env *env,
return -EACCES;
}
stype = reg_state->stack[spi].slot_type;
+ reg = &reg_state->stack[spi].spilled_ptr;
if (stype[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
- verbose(env, "invalid size of register spill\n");
- return -EACCES;
+ if (reg->type != SCALAR_VALUE) {
+ verbose_linfo(env, env->insn_idx, "; ");
+ verbose(env, "invalid size of register fill\n");
+ return -EACCES;
+ }
+ if (value_regno >= 0) {
+ mark_reg_unknown(env, state->regs, value_regno);
+ state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+ }
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ return 0;
}
for (i = 1; i < BPF_REG_SIZE; i++) {
if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
@@ -1348,16 +2047,14 @@ static int check_stack_read(struct bpf_verifier_env *env,
if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] = reg_state->stack[spi].spilled_ptr;
+ state->regs[value_regno] = *reg;
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
*/
state->regs[value_regno].live |= REG_LIVE_WRITTEN;
}
- mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
- reg_state->stack[spi].spilled_ptr.parent);
- return 0;
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else {
int zeros = 0;
@@ -1372,22 +2069,32 @@ static int check_stack_read(struct bpf_verifier_env *env,
off, i, size);
return -EACCES;
}
- mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
- reg_state->stack[spi].spilled_ptr.parent);
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (value_regno >= 0) {
if (zeros == size) {
/* any size read into register is zero extended,
* so the whole register == const_zero
*/
__mark_reg_const_zero(&state->regs[value_regno]);
+ /* backtracking doesn't support STACK_ZERO yet,
+ * so mark it precise here, so that later
+ * backtracking can stop here.
+ * Backtracking may not need this if this register
+ * doesn't participate in pointer adjustment.
+ * Forward propagation of precise flag is not
+ * necessary either. This mark is only to stop
+ * backtracking. Any register that contributed
+ * to const 0 was marked precise before spill.
+ */
+ state->regs[value_regno].precise = true;
} else {
/* have read misc data from the stack */
mark_reg_unknown(env, state->regs, value_regno);
}
state->regs[value_regno].live |= REG_LIVE_WRITTEN;
}
- return 0;
}
+ return 0;
}
static int check_stack_access(struct bpf_verifier_env *env,
@@ -1402,7 +2109,7 @@ static int check_stack_access(struct bpf_verifier_env *env,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable stack access var_off=%s off=%d size=%d",
+ verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
tn_buf, off, size);
return -EACCES;
}
@@ -1415,6 +2122,28 @@ static int check_stack_access(struct bpf_verifier_env *env,
return 0;
}
+static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, enum bpf_access_type type)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map = regs[regno].map_ptr;
+ u32 cap = bpf_map_flags_to_cap(map);
+
+ if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
+ verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+
+ if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
+ verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* check read/write into map element returned by bpf_map_lookup_elem() */
static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size, bool zero_size_allowed)
@@ -1444,7 +2173,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
*/
- if (env->log.level)
+ if (env->log.level & BPF_LOG_LEVEL)
print_verifier_state(env, state);
/* The minimum value is only important with signed
@@ -1483,6 +2212,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
if (err)
verbose(env, "R%d max value is outside of the array range\n",
regno);
+
+ if (map_value_has_spin_lock(reg->map_ptr)) {
+ u32 lock = reg->map_ptr->spin_lock_off;
+
+ /* if any part of struct bpf_spin_lock can be touched by
+ * load/store reject this program.
+ * To check that [x1, x2) overlaps with [y1, y2)
+ * it is sufficient to check x1 < y2 && y1 < x2.
+ */
+ if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
+ lock < reg->umax_value + off + size) {
+ verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
+ return -EACCES;
+ }
+ }
return err;
}
@@ -1516,6 +2260,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
env->seen_direct_write = true;
return true;
+
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ if (t == BPF_WRITE)
+ env->seen_direct_write = true;
+
+ return true;
+
default:
return false;
}
@@ -1617,12 +2368,14 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
return 0;
}
-static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,
- int size, enum bpf_access_type t)
+static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
+ u32 regno, int off, int size,
+ enum bpf_access_type t)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
- struct bpf_insn_access_aux info;
+ struct bpf_insn_access_aux info = {};
+ bool valid;
if (reg->smin_value < 0) {
verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
@@ -1630,13 +2383,34 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off,
return -EACCES;
}
- if (!bpf_sock_is_valid_access(off, size, t, &info)) {
- verbose(env, "invalid bpf_sock access off=%d size=%d\n",
- off, size);
- return -EACCES;
+ switch (reg->type) {
+ case PTR_TO_SOCK_COMMON:
+ valid = bpf_sock_common_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_SOCKET:
+ valid = bpf_sock_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_TCP_SOCK:
+ valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
+ break;
+ case PTR_TO_XDP_SOCK:
+ valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
+ break;
+ default:
+ valid = false;
}
- return 0;
+
+ if (valid) {
+ env->insn_aux_data[insn_idx].ctx_field_size =
+ info.ctx_field_size;
+ return 0;
+ }
+
+ verbose(env, "R%d invalid %s access off=%d size=%d\n",
+ regno, reg_type_str[reg->type], off, size);
+
+ return -EACCES;
}
static bool __is_pointer_value(bool allow_ptr_leaks,
@@ -1662,8 +2436,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
{
const struct bpf_reg_state *reg = reg_state(env, regno);
- return reg->type == PTR_TO_CTX ||
- reg->type == PTR_TO_SOCKET;
+ return reg->type == PTR_TO_CTX;
+}
+
+static bool is_sk_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = reg_state(env, regno);
+
+ return type_is_sk_pointer(reg->type);
}
static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
@@ -1774,6 +2554,15 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_SOCKET:
pointer_desc = "sock ";
break;
+ case PTR_TO_SOCK_COMMON:
+ pointer_desc = "sock_common ";
+ break;
+ case PTR_TO_TCP_SOCK:
+ pointer_desc = "tcp_sock ";
+ break;
+ case PTR_TO_XDP_SOCK:
+ pointer_desc = "xdp_sock ";
+ break;
default:
break;
}
@@ -1840,8 +2629,9 @@ continue_func:
}
frame++;
if (frame >= MAX_CALL_FRAMES) {
- WARN_ONCE(1, "verifier bug. Call stack is too deep\n");
- return -EFAULT;
+ verbose(env, "the call stack of %d frames is too deep !\n",
+ frame);
+ return -E2BIG;
}
goto process_func;
}
@@ -1897,6 +2687,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env,
return 0;
}
+static int check_tp_buffer_access(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size)
+{
+ if (off < 0) {
+ verbose(env,
+ "R%d invalid tracepoint buffer access: off=%d, size=%d",
+ regno, off, size);
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env,
+ "R%d invalid variable buffer offset: off=%d, var_off=%s",
+ regno, off, tn_buf);
+ return -EACCES;
+ }
+ if (off + size > env->prog->aux->max_tp_access)
+ env->prog->aux->max_tp_access = off + size;
+
+ return 0;
+}
+
+
/* truncate register to smaller size (in bytes)
* must be called with size < BPF_REG_SIZE
*/
@@ -1953,7 +2769,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
-
+ err = check_map_access_type(env, regno, off, size, t);
+ if (err)
+ return err;
err = check_map_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
@@ -1977,11 +2795,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* PTR_TO_PACKET[_META,_END]. In the latter
* case, we know the offset is zero.
*/
- if (reg_type == SCALAR_VALUE)
+ if (reg_type == SCALAR_VALUE) {
mark_reg_unknown(env, regs, value_regno);
- else
+ } else {
mark_reg_known_zero(env, regs,
value_regno);
+ if (reg_type_may_be_null(reg_type))
+ regs[value_regno].id = ++env->id_gen;
+ /* A load of ctx field could have different
+ * actual load size with the one encoded in the
+ * insn. When the dst is PTR, it is for sure not
+ * a sub-register.
+ */
+ regs[value_regno].subreg_def = DEF_NOT_SUBREG;
+ }
regs[value_regno].type = reg_type;
}
@@ -2027,14 +2854,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_flow_keys_access(env, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_SOCKET) {
+ } else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
- verbose(env, "cannot write into socket\n");
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str[reg->type]);
return -EACCES;
}
- err = check_sock_access(env, regno, off, size, t);
+ err = check_sock_access(env, insn_idx, regno, off, size, t);
if (!err && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_TP_BUFFER) {
+ err = check_tp_buffer_access(env, reg, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str[reg->type]);
@@ -2076,7 +2908,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
if (is_ctx_reg(env, insn->dst_reg) ||
is_pkt_reg(env, insn->dst_reg) ||
- is_flow_key_reg(env, insn->dst_reg)) {
+ is_flow_key_reg(env, insn->dst_reg) ||
+ is_sk_reg(env, insn->dst_reg)) {
verbose(env, "BPF_XADD stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str[reg_state(env, insn->dst_reg)->type]);
@@ -2094,6 +2927,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
BPF_SIZE(insn->code), BPF_WRITE, -1, true);
}
+static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
+ int off, int access_size,
+ bool zero_size_allowed)
+{
+ struct bpf_reg_state *reg = reg_state(env, regno);
+
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+ if (tnum_is_const(reg->var_off)) {
+ verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ } else {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
+ regno, tn_buf, access_size);
+ }
+ return -EACCES;
+ }
+ return 0;
+}
+
/* when register 'regno' is passed into function that will read 'access_size'
* bytes from that pointer, make sure that it's within stack boundary
* and all elements of stack are initialized.
@@ -2106,7 +2962,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
- int off, i, slot, spi;
+ int err, min_off, max_off, i, j, slot, spi;
if (reg->type != PTR_TO_STACK) {
/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -2120,21 +2976,57 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
- /* Only allow fixed-offset stack reads */
- if (!tnum_is_const(reg->var_off)) {
- char tn_buf[48];
+ if (tnum_is_const(reg->var_off)) {
+ min_off = max_off = reg->var_off.value + reg->off;
+ err = __check_stack_boundary(env, regno, min_off, access_size,
+ zero_size_allowed);
+ if (err)
+ return err;
+ } else {
+ /* Variable offset is prohibited for unprivileged mode for
+ * simplicity since it requires corresponding support in
+ * Spectre masking for stack ALU.
+ * See also retrieve_ptr_limit().
+ */
+ if (!env->allow_ptr_leaks) {
+ char tn_buf[48];
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid variable stack read R%d var_off=%s\n",
- regno, tn_buf);
- return -EACCES;
- }
- off = reg->off + reg->var_off.value;
- if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
- access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
- verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
- regno, off, access_size);
- return -EACCES;
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
+ regno, tn_buf);
+ return -EACCES;
+ }
+ /* Only initialized buffer on stack is allowed to be accessed
+ * with variable offset. With uninitialized buffer it's hard to
+ * guarantee that whole memory is marked as initialized on
+ * helper return since specific bounds are unknown what may
+ * cause uninitialized stack leaking.
+ */
+ if (meta && meta->raw_mode)
+ meta = NULL;
+
+ if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+ reg->smax_value <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "R%d unbounded indirect variable offset stack access\n",
+ regno);
+ return -EACCES;
+ }
+ min_off = reg->smin_value + reg->off;
+ max_off = reg->smax_value + reg->off;
+ err = __check_stack_boundary(env, regno, min_off, access_size,
+ zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d min value is outside of stack bound\n",
+ regno);
+ return err;
+ }
+ err = __check_stack_boundary(env, regno, max_off, access_size,
+ zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d max value is outside of stack bound\n",
+ regno);
+ return err;
+ }
}
if (meta && meta->raw_mode) {
@@ -2143,10 +3035,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return 0;
}
- for (i = 0; i < access_size; i++) {
+ for (i = min_off; i < max_off + access_size; i++) {
u8 *stype;
- slot = -(off + i) - 1;
+ slot = -i - 1;
spi = slot / BPF_REG_SIZE;
if (state->allocated_stack <= slot)
goto err;
@@ -2158,18 +3050,35 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
*stype = STACK_MISC;
goto mark;
}
+ if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
+ __mark_reg_unknown(&state->stack[spi].spilled_ptr);
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ state->stack[spi].slot_type[j] = STACK_MISC;
+ goto mark;
+ }
+
err:
- verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
- off, i, access_size);
+ if (tnum_is_const(reg->var_off)) {
+ verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
+ min_off, i - min_off, access_size);
+ } else {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
+ tn_buf, i - min_off, access_size);
+ }
return -EACCES;
mark:
/* reading any byte out of 8-byte 'spill_slot' will cause
* the whole slot to be marked as 'read'
*/
mark_reg_read(env, &state->stack[spi].spilled_ptr,
- state->stack[spi].spilled_ptr.parent);
+ state->stack[spi].spilled_ptr.parent,
+ REG_LIVE_READ64);
}
- return update_stack_depth(env, state, off);
+ return update_stack_depth(env, state, min_off);
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
@@ -2184,6 +3093,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
case PTR_TO_MAP_VALUE:
+ if (check_map_access_type(env, regno, reg->off, access_size,
+ meta && meta->raw_mode ? BPF_WRITE :
+ BPF_READ))
+ return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed);
default: /* scalar_value|ptr_to_stack or invalid ptr */
@@ -2192,6 +3105,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+/* Implementation details:
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
+ * value_or_null->value transition, since the verifier only cares about
+ * the range of access to valid map value pointer and doesn't care about actual
+ * address of the map element.
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
+ * reg->id > 0 after value_or_null->value transition. By doing so
+ * two bpf_map_lookups will be considered two different pointers that
+ * point to different bpf_spin_locks.
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
+ * dead-locks.
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
+ * reg_is_refcounted() logic. The verifier needs to remember only
+ * one spin_lock instead of array of acquired_refs.
+ * cur_state->active_spin_lock remembers which map value element got locked
+ * and clears it after bpf_spin_unlock.
+ */
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
+ bool is_lock)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_verifier_state *cur = env->cur_state;
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "R%d is not a pointer to map_value\n", regno);
+ return -EINVAL;
+ }
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env,
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!map_value_has_spin_lock(map)) {
+ if (map->spin_lock_off == -E2BIG)
+ verbose(env,
+ "map '%s' has more than one 'struct bpf_spin_lock'\n",
+ map->name);
+ else if (map->spin_lock_off == -ENOENT)
+ verbose(env,
+ "map '%s' doesn't have 'struct bpf_spin_lock'\n",
+ map->name);
+ else
+ verbose(env,
+ "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (map->spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
+ val + reg->off);
+ return -EINVAL;
+ }
+ if (is_lock) {
+ if (cur->active_spin_lock) {
+ verbose(env,
+ "Locking two bpf_spin_locks are not allowed\n");
+ return -EINVAL;
+ }
+ cur->active_spin_lock = reg->id;
+ } else {
+ if (!cur->active_spin_lock) {
+ verbose(env, "bpf_spin_unlock without taking a lock\n");
+ return -EINVAL;
+ }
+ if (cur->active_spin_lock != reg->id) {
+ verbose(env, "bpf_spin_unlock of different lock\n");
+ return -EINVAL;
+ }
+ cur->active_spin_lock = 0;
+ }
+ return 0;
+}
+
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_MEM ||
@@ -2205,6 +3203,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
+static bool arg_type_is_int_ptr(enum bpf_arg_type type)
+{
+ return type == ARG_PTR_TO_INT ||
+ type == ARG_PTR_TO_LONG;
+}
+
+static int int_ptr_type_to_size(enum bpf_arg_type type)
+{
+ if (type == ARG_PTR_TO_INT)
+ return sizeof(u32);
+ else if (type == ARG_PTR_TO_LONG)
+ return sizeof(u64);
+
+ return -EINVAL;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
@@ -2237,10 +3251,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
+ arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
expected_type = PTR_TO_STACK;
- if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
- type != expected_type)
+ if (register_is_null(reg) &&
+ arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL)
+ /* final test in check_stack_boundary() */;
+ else if (!type_is_pkt_pointer(type) &&
+ type != PTR_TO_MAP_VALUE &&
+ type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -2258,16 +3277,35 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_ctx_reg(env, reg, regno);
if (err < 0)
return err;
+ } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) {
+ expected_type = PTR_TO_SOCK_COMMON;
+ /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */
+ if (!type_is_sk_pointer(type))
+ goto err_type;
+ if (reg->ref_obj_id) {
+ if (meta->ref_obj_id) {
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ }
} else if (arg_type == ARG_PTR_TO_SOCKET) {
expected_type = PTR_TO_SOCKET;
if (type != expected_type)
goto err_type;
- if (meta->ptr_id || !reg->id) {
- verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n",
- meta->ptr_id, reg->id);
+ } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+ if (meta->func_id == BPF_FUNC_spin_lock) {
+ if (process_spin_lock(env, regno, true))
+ return -EACCES;
+ } else if (meta->func_id == BPF_FUNC_spin_unlock) {
+ if (process_spin_lock(env, regno, false))
+ return -EACCES;
+ } else {
+ verbose(env, "verifier internal error\n");
return -EFAULT;
}
- meta->ptr_id = reg->id;
} else if (arg_type_is_mem_ptr(arg_type)) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
@@ -2282,6 +3320,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
+ } else if (arg_type_is_int_ptr(arg_type)) {
+ expected_type = PTR_TO_STACK;
+ if (!type_is_pkt_pointer(type) &&
+ type != PTR_TO_MAP_VALUE &&
+ type != expected_type)
+ goto err_type;
} else {
verbose(env, "unsupported arg_type %d\n", arg_type);
return -EFAULT;
@@ -2308,6 +3352,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
meta->map_ptr->key_size, false,
NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
+ (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
+ !register_is_null(reg)) ||
arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
@@ -2363,6 +3409,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_helper_mem_access(env, regno - 1,
reg->umax_value,
zero_size_allowed, meta);
+ if (!err)
+ err = mark_chain_precision(env, regno);
+ } else if (arg_type_is_int_ptr(arg_type)) {
+ int size = int_ptr_type_to_size(arg_type);
+
+ err = check_helper_mem_access(env, regno, size, false, meta);
+ if (err)
+ return err;
+ err = check_ptr_alignment(env, reg, 0, size, true);
}
return err;
@@ -2404,22 +3459,23 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (func_id != BPF_FUNC_get_local_storage)
goto error;
break;
- /* devmap returns a pointer to a live net_device ifindex that we cannot
- * allow to be modified from bpf side. So do not allow lookup elements
- * for now.
- */
case BPF_MAP_TYPE_DEVMAP:
- if (func_id != BPF_FUNC_redirect_map)
+ if (func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
/* Restrict bpf side of cpumap and xskmap, open when use-cases
* appear.
*/
case BPF_MAP_TYPE_CPUMAP:
- case BPF_MAP_TYPE_XSKMAP:
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
+ case BPF_MAP_TYPE_XSKMAP:
+ if (func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_elem)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
@@ -2450,6 +3506,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_push_elem)
goto error;
break;
+ case BPF_MAP_TYPE_SK_STORAGE:
+ if (func_id != BPF_FUNC_sk_storage_get &&
+ func_id != BPF_FUNC_sk_storage_delete)
+ goto error;
+ break;
default:
break;
}
@@ -2513,6 +3574,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_STACK)
goto error;
break;
+ case BPF_FUNC_sk_storage_get:
+ case BPF_FUNC_sk_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
+ goto error;
+ break;
default:
break;
}
@@ -2573,32 +3639,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
return true;
}
-static bool check_refcount_ok(const struct bpf_func_proto *fn)
+static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id)
{
int count = 0;
- if (arg_type_is_refcounted(fn->arg1_type))
+ if (arg_type_may_be_refcounted(fn->arg1_type))
count++;
- if (arg_type_is_refcounted(fn->arg2_type))
+ if (arg_type_may_be_refcounted(fn->arg2_type))
count++;
- if (arg_type_is_refcounted(fn->arg3_type))
+ if (arg_type_may_be_refcounted(fn->arg3_type))
count++;
- if (arg_type_is_refcounted(fn->arg4_type))
+ if (arg_type_may_be_refcounted(fn->arg4_type))
count++;
- if (arg_type_is_refcounted(fn->arg5_type))
+ if (arg_type_may_be_refcounted(fn->arg5_type))
count++;
+ /* A reference acquiring function cannot acquire
+ * another refcounted ptr.
+ */
+ if (is_acquire_function(func_id) && count)
+ return false;
+
/* We only support one arg being unreferenced at the moment,
* which is sufficient for the helper functions we have right now.
*/
return count <= 1;
}
-static int check_func_proto(const struct bpf_func_proto *fn)
+static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
{
return check_raw_mode_ok(fn) &&
check_arg_pair_ok(fn) &&
- check_refcount_ok(fn) ? 0 : -EINVAL;
+ check_refcount_ok(fn, func_id) ? 0 : -EINVAL;
}
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
@@ -2632,19 +3704,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
}
static void release_reg_references(struct bpf_verifier_env *env,
- struct bpf_func_state *state, int id)
+ struct bpf_func_state *state,
+ int ref_obj_id)
{
struct bpf_reg_state *regs = state->regs, *reg;
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].id == id)
+ if (regs[i].ref_obj_id == ref_obj_id)
mark_reg_unknown(env, regs, i);
bpf_for_each_spilled_reg(i, state, reg) {
if (!reg)
continue;
- if (reg_is_refcounted(reg) && reg->id == id)
+ if (reg->ref_obj_id == ref_obj_id)
__mark_reg_unknown(reg);
}
}
@@ -2653,15 +3726,20 @@ static void release_reg_references(struct bpf_verifier_env *env,
* resources. Identify all copies of the same pointer and clear the reference.
*/
static int release_reference(struct bpf_verifier_env *env,
- struct bpf_call_arg_meta *meta)
+ int ref_obj_id)
{
struct bpf_verifier_state *vstate = env->cur_state;
+ int err;
int i;
+ err = release_reference_state(cur_func(env), ref_obj_id);
+ if (err)
+ return err;
+
for (i = 0; i <= vstate->curframe; i++)
- release_reg_references(env, vstate->frame[i], meta->ptr_id);
+ release_reg_references(env, vstate->frame[i], ref_obj_id);
- return release_reference_state(env, meta->ptr_id);
+ return 0;
}
static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
@@ -2730,7 +3808,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* and go analyze first insn of the callee */
*insn_idx = target_insn;
- if (env->log.level) {
+ if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
print_verifier_state(env, caller);
verbose(env, "callee:\n");
@@ -2770,7 +3848,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return err;
*insn_idx = callee->callsite + 1;
- if (env->log.level) {
+ if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
print_verifier_state(env, callee);
verbose(env, "to caller at %d:\n", *insn_idx);
@@ -2804,6 +3882,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
int func_id, int insn_idx)
{
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ struct bpf_map *map = meta->map_ptr;
if (func_id != BPF_FUNC_tail_call &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -2814,11 +3893,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
func_id != BPF_FUNC_map_peek_elem)
return 0;
- if (meta->map_ptr == NULL) {
+ if (map == NULL) {
verbose(env, "kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
+ /* In case of read-only, some additional restrictions
+ * need to be applied in order to prevent altering the
+ * state of the map from program side.
+ */
+ if ((map->map_flags & BPF_F_RDONLY_PROG) &&
+ (func_id == BPF_FUNC_map_delete_elem ||
+ func_id == BPF_FUNC_map_update_elem ||
+ func_id == BPF_FUNC_map_push_elem ||
+ func_id == BPF_FUNC_map_pop_elem)) {
+ verbose(env, "write into map forbidden\n");
+ return -EACCES;
+ }
+
if (!BPF_MAP_PTR(aux->map_state))
bpf_map_ptr_store(aux, meta->map_ptr,
meta->map_ptr->unpriv_array);
@@ -2880,13 +3972,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
- err = check_func_proto(fn);
+ err = check_func_proto(fn, func_id);
if (err) {
verbose(env, "kernel subsystem misconfigured func %s#%d\n",
func_id_name(func_id), func_id);
return err;
}
+ meta.func_id = func_id;
/* check args */
err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
if (err)
@@ -2925,9 +4018,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return err;
}
} else if (is_release_function(func_id)) {
- err = release_reference(env, &meta);
- if (err)
+ err = release_reference(env, meta.ref_obj_id);
+ if (err) {
+ verbose(env, "func %s#%d reference has not been acquired before\n",
+ func_id_name(func_id), func_id);
return err;
+ }
}
regs = cur_regs(env);
@@ -2947,6 +4043,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
+ /* helper call returns 64-bit value. */
+ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
/* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
@@ -2969,23 +4068,44 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].map_ptr = meta.map_ptr;
if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
+ if (map_value_has_spin_lock(meta.map_ptr))
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
}
} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
- int id = acquire_reference_state(env, insn_idx);
- if (id < 0)
- return id;
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- regs[BPF_REG_0].id = id;
+ regs[BPF_REG_0].id = ++env->id_gen;
+ } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
+ regs[BPF_REG_0].id = ++env->id_gen;
+ } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
return -EINVAL;
}
+ if (is_ptr_cast_function(func_id)) {
+ /* For release_reference() */
+ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ } else if (is_acquire_function(func_id)) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ /* For mark_ptr_or_null_reg() */
+ regs[BPF_REG_0].id = id;
+ /* For release_reference() */
+ regs[BPF_REG_0].ref_obj_id = id;
+ }
+
do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
@@ -3084,6 +4204,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
switch (ptr_reg->type) {
case PTR_TO_STACK:
+ /* Indirect variable offset stack access is prohibited in
+ * unprivileged mode so it's not handled here.
+ */
off = ptr_reg->off + ptr_reg->var_off.value;
if (mask_to_left)
*ptr_limit = MAX_BPF_STACK + off;
@@ -3184,7 +4307,7 @@ do_sim:
*dst_reg = *ptr_reg;
}
ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
- if (!ptr_is_dst_reg)
+ if (!ptr_is_dst_reg && ret)
*dst_reg = tmp;
return !ret ? -EFAULT : 0;
}
@@ -3239,6 +4362,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
+ case PTR_TO_XDP_SOCK:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
@@ -3716,6 +4844,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
u8 opcode = BPF_OP(insn->code);
+ int err;
dst_reg = &regs[insn->dst_reg];
src_reg = NULL;
@@ -3742,11 +4871,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* This is legal, but we have to reverse our
* src/dest handling in computing the range
*/
+ err = mark_chain_precision(env, insn->dst_reg);
+ if (err)
+ return err;
return adjust_ptr_min_max_vals(env, insn,
src_reg, dst_reg);
}
} else if (ptr_reg) {
/* pointer += scalar */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
}
@@ -3850,6 +4985,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
*dst_reg = *src_reg;
dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
@@ -3860,6 +4996,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (src_reg->type == SCALAR_VALUE) {
*dst_reg = *src_reg;
dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = env->insn_idx + 1;
} else {
mark_reg_unknown(env, regs,
insn->dst_reg);
@@ -3936,15 +5073,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static void __find_good_pkt_pointers(struct bpf_func_state *state,
+ struct bpf_reg_state *dst_reg,
+ enum bpf_reg_type type, u16 new_range)
+{
+ struct bpf_reg_state *reg;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ reg = &state->regs[i];
+ if (reg->type == type && reg->id == dst_reg->id)
+ /* keep the maximum range already checked */
+ reg->range = max(reg->range, new_range);
+ }
+
+ bpf_for_each_spilled_reg(i, state, reg) {
+ if (!reg)
+ continue;
+ if (reg->type == type && reg->id == dst_reg->id)
+ reg->range = max(reg->range, new_range);
+ }
+}
+
static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
struct bpf_reg_state *dst_reg,
enum bpf_reg_type type,
bool range_right_open)
{
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs, *reg;
u16 new_range;
- int i, j;
+ int i;
if (dst_reg->off < 0 ||
(dst_reg->off == 0 && range_right_open))
@@ -4009,20 +5166,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* the range won't allow anything.
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
- for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == type && regs[i].id == dst_reg->id)
- /* keep the maximum range already checked */
- regs[i].range = max(regs[i].range, new_range);
-
- for (j = 0; j <= vstate->curframe; j++) {
- state = vstate->frame[j];
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- if (reg->type == type && reg->id == dst_reg->id)
- reg->range = max(reg->range, new_range);
- }
- }
+ for (i = 0; i <= vstate->curframe; i++)
+ __find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
+ new_range);
}
/* compute branch direction of the expression "if (reg opcode val) goto target;"
@@ -4031,11 +5177,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
* 0 - branch will not be taken and fall-through to next insn
* -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10]
*/
-static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
+static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
+ bool is_jmp32)
{
+ struct bpf_reg_state reg_lo;
+ s64 sval;
+
if (__is_pointer_value(false, reg))
return -1;
+ if (is_jmp32) {
+ reg_lo = *reg;
+ reg = &reg_lo;
+ /* For JMP32, only low 32 bits are compared, coerce_reg_to_size
+ * could truncate high bits and update umin/umax according to
+ * information of low bits.
+ */
+ coerce_reg_to_size(reg, 4);
+ /* smin/smax need special handling. For example, after coerce,
+ * if smin_value is 0x00000000ffffffffLL, the value is -1 when
+ * used as operand to JMP32. It is a negative number from s32's
+ * point of view, while it is a positive number when seen as
+ * s64. The smin/smax are kept as s64, therefore, when used with
+ * JMP32, they need to be transformed into s32, then sign
+ * extended back to s64.
+ *
+ * Also, smin/smax were copied from umin/umax. If umin/umax has
+ * different sign bit, then min/max relationship doesn't
+ * maintain after casting into s32, for this case, set smin/smax
+ * to safest range.
+ */
+ if ((reg->umax_value ^ reg->umin_value) &
+ (1ULL << 31)) {
+ reg->smin_value = S32_MIN;
+ reg->smax_value = S32_MAX;
+ }
+ reg->smin_value = (s64)(s32)reg->smin_value;
+ reg->smax_value = (s64)(s32)reg->smax_value;
+
+ val = (u32)val;
+ sval = (s64)(s32)val;
+ } else {
+ sval = (s64)val;
+ }
+
switch (opcode) {
case BPF_JEQ:
if (tnum_is_const(reg->var_off))
@@ -4058,9 +5243,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSGT:
- if (reg->smin_value > (s64)val)
+ if (reg->smin_value > sval)
return 1;
- else if (reg->smax_value < (s64)val)
+ else if (reg->smax_value < sval)
return 0;
break;
case BPF_JLT:
@@ -4070,9 +5255,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSLT:
- if (reg->smax_value < (s64)val)
+ if (reg->smax_value < sval)
return 1;
- else if (reg->smin_value >= (s64)val)
+ else if (reg->smin_value >= sval)
return 0;
break;
case BPF_JGE:
@@ -4082,9 +5267,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSGE:
- if (reg->smin_value >= (s64)val)
+ if (reg->smin_value >= sval)
return 1;
- else if (reg->smax_value < (s64)val)
+ else if (reg->smax_value < sval)
return 0;
break;
case BPF_JLE:
@@ -4094,9 +5279,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return 0;
break;
case BPF_JSLE:
- if (reg->smax_value <= (s64)val)
+ if (reg->smax_value <= sval)
return 1;
- else if (reg->smin_value > (s64)val)
+ else if (reg->smin_value > sval)
return 0;
break;
}
@@ -4104,6 +5289,29 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
return -1;
}
+/* Generate min value of the high 32-bit from TNUM info. */
+static u64 gen_hi_min(struct tnum var)
+{
+ return var.value & ~0xffffffffULL;
+}
+
+/* Generate max value of the high 32-bit from TNUM info. */
+static u64 gen_hi_max(struct tnum var)
+{
+ return (var.value | var.mask) & ~0xffffffffULL;
+}
+
+/* Return true if VAL is compared with a s64 sign extended from s32, and they
+ * are with the same signedness.
+ */
+static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg)
+{
+ return ((s32)sval >= 0 &&
+ reg->smin_value >= 0 && reg->smax_value <= S32_MAX) ||
+ ((s32)sval < 0 &&
+ reg->smax_value <= 0 && reg->smin_value >= S32_MIN);
+}
+
/* Adjusts the register min/max values in the case that the dst_reg is the
* variable register that we are working on, and src_reg is a constant or we're
* simply doing a BPF_K check.
@@ -4111,8 +5319,10 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
*/
static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
- u8 opcode)
+ u8 opcode, bool is_jmp32)
{
+ s64 sval;
+
/* If the dst_reg is a pointer, we can't learn anything about its
* variable offset from the compare (unless src_reg were a pointer into
* the same object, but we don't bother with that.
@@ -4122,19 +5332,31 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
if (__is_pointer_value(false, false_reg))
return;
+ val = is_jmp32 ? (u32)val : val;
+ sval = is_jmp32 ? (s64)(s32)val : (s64)val;
+
switch (opcode) {
case BPF_JEQ:
- /* If this is false then we know nothing Jon Snow, but if it is
- * true then we know for sure.
- */
- __mark_reg_known(true_reg, val);
- break;
case BPF_JNE:
- /* If this is true we know nothing Jon Snow, but if it is false
- * we know the value for sure;
+ {
+ struct bpf_reg_state *reg =
+ opcode == BPF_JEQ ? true_reg : false_reg;
+
+ /* For BPF_JEQ, if this is false we know nothing Jon Snow, but
+ * if it is true we know the value for sure. Likewise for
+ * BPF_JNE.
*/
- __mark_reg_known(false_reg, val);
+ if (is_jmp32) {
+ u64 old_v = reg->var_off.value;
+ u64 hi_mask = ~0xffffffffULL;
+
+ reg->var_off.value = (old_v & hi_mask) | val;
+ reg->var_off.mask &= hi_mask;
+ } else {
+ __mark_reg_known(reg, val);
+ }
break;
+ }
case BPF_JSET:
false_reg->var_off = tnum_and(false_reg->var_off,
tnum_const(~val));
@@ -4142,38 +5364,61 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
true_reg->var_off = tnum_or(true_reg->var_off,
tnum_const(val));
break;
- case BPF_JGT:
- false_reg->umax_value = min(false_reg->umax_value, val);
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
- break;
- case BPF_JSGT:
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
- break;
- case BPF_JLT:
- false_reg->umin_value = max(false_reg->umin_value, val);
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
- break;
- case BPF_JSLT:
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
- break;
case BPF_JGE:
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
- true_reg->umin_value = max(true_reg->umin_value, val);
+ case BPF_JGT:
+ {
+ u64 false_umax = opcode == BPF_JGT ? val : val - 1;
+ u64 true_umin = opcode == BPF_JGT ? val + 1 : val;
+
+ if (is_jmp32) {
+ false_umax += gen_hi_max(false_reg->var_off);
+ true_umin += gen_hi_min(true_reg->var_off);
+ }
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
+ }
case BPF_JSGE:
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
+ case BPF_JSGT:
+ {
+ s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1;
+ s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval;
+
+ /* If the full s64 was not sign-extended from s32 then don't
+ * deduct further info.
+ */
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
+ }
case BPF_JLE:
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
- true_reg->umax_value = min(true_reg->umax_value, val);
+ case BPF_JLT:
+ {
+ u64 false_umin = opcode == BPF_JLT ? val : val + 1;
+ u64 true_umax = opcode == BPF_JLT ? val - 1 : val;
+
+ if (is_jmp32) {
+ false_umin += gen_hi_min(false_reg->var_off);
+ true_umax += gen_hi_max(true_reg->var_off);
+ }
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
+ }
case BPF_JSLE:
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
+ case BPF_JSLT:
+ {
+ s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1;
+ s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
break;
+ }
default:
break;
}
@@ -4196,24 +5441,34 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
*/
static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
- u8 opcode)
+ u8 opcode, bool is_jmp32)
{
+ s64 sval;
+
if (__is_pointer_value(false, false_reg))
return;
+ val = is_jmp32 ? (u32)val : val;
+ sval = is_jmp32 ? (s64)(s32)val : (s64)val;
+
switch (opcode) {
case BPF_JEQ:
- /* If this is false then we know nothing Jon Snow, but if it is
- * true then we know for sure.
- */
- __mark_reg_known(true_reg, val);
- break;
case BPF_JNE:
- /* If this is true we know nothing Jon Snow, but if it is false
- * we know the value for sure;
- */
- __mark_reg_known(false_reg, val);
+ {
+ struct bpf_reg_state *reg =
+ opcode == BPF_JEQ ? true_reg : false_reg;
+
+ if (is_jmp32) {
+ u64 old_v = reg->var_off.value;
+ u64 hi_mask = ~0xffffffffULL;
+
+ reg->var_off.value = (old_v & hi_mask) | val;
+ reg->var_off.mask &= hi_mask;
+ } else {
+ __mark_reg_known(reg, val);
+ }
break;
+ }
case BPF_JSET:
false_reg->var_off = tnum_and(false_reg->var_off,
tnum_const(~val));
@@ -4221,38 +5476,58 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
true_reg->var_off = tnum_or(true_reg->var_off,
tnum_const(val));
break;
- case BPF_JGT:
- true_reg->umax_value = min(true_reg->umax_value, val - 1);
- false_reg->umin_value = max(false_reg->umin_value, val);
- break;
- case BPF_JSGT:
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1);
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val);
- break;
- case BPF_JLT:
- true_reg->umin_value = max(true_reg->umin_value, val + 1);
- false_reg->umax_value = min(false_reg->umax_value, val);
- break;
- case BPF_JSLT:
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1);
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val);
- break;
case BPF_JGE:
- true_reg->umax_value = min(true_reg->umax_value, val);
- false_reg->umin_value = max(false_reg->umin_value, val + 1);
+ case BPF_JGT:
+ {
+ u64 false_umin = opcode == BPF_JGT ? val : val + 1;
+ u64 true_umax = opcode == BPF_JGT ? val - 1 : val;
+
+ if (is_jmp32) {
+ false_umin += gen_hi_min(false_reg->var_off);
+ true_umax += gen_hi_max(true_reg->var_off);
+ }
+ false_reg->umin_value = max(false_reg->umin_value, false_umin);
+ true_reg->umax_value = min(true_reg->umax_value, true_umax);
break;
+ }
case BPF_JSGE:
- true_reg->smax_value = min_t(s64, true_reg->smax_value, val);
- false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1);
+ case BPF_JSGT:
+ {
+ s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1;
+ s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smin_value = max(false_reg->smin_value, false_smin);
+ true_reg->smax_value = min(true_reg->smax_value, true_smax);
break;
+ }
case BPF_JLE:
- true_reg->umin_value = max(true_reg->umin_value, val);
- false_reg->umax_value = min(false_reg->umax_value, val - 1);
+ case BPF_JLT:
+ {
+ u64 false_umax = opcode == BPF_JLT ? val : val - 1;
+ u64 true_umin = opcode == BPF_JLT ? val + 1 : val;
+
+ if (is_jmp32) {
+ false_umax += gen_hi_max(false_reg->var_off);
+ true_umin += gen_hi_min(true_reg->var_off);
+ }
+ false_reg->umax_value = min(false_reg->umax_value, false_umax);
+ true_reg->umin_value = max(true_reg->umin_value, true_umin);
break;
+ }
case BPF_JSLE:
- true_reg->smin_value = max_t(s64, true_reg->smin_value, val);
- false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1);
+ case BPF_JSLT:
+ {
+ s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1;
+ s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval;
+
+ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg))
+ break;
+ false_reg->smax_value = min(false_reg->smax_value, false_smax);
+ true_reg->smin_value = max(true_reg->smin_value, true_smin);
break;
+ }
default:
break;
}
@@ -4338,22 +5613,54 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (reg->map_ptr->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = reg->map_ptr->inner_map_meta;
+ } else if (reg->map_ptr->map_type ==
+ BPF_MAP_TYPE_XSKMAP) {
+ reg->type = PTR_TO_XDP_SOCK;
} else {
reg->type = PTR_TO_MAP_VALUE;
}
} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
reg->type = PTR_TO_SOCKET;
+ } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
+ reg->type = PTR_TO_SOCK_COMMON;
+ } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
+ reg->type = PTR_TO_TCP_SOCK;
}
- if (is_null || !reg_is_refcounted(reg)) {
- /* We don't need id from this point onwards anymore,
- * thus we should better reset it, so that state
- * pruning has chances to take effect.
+ if (is_null) {
+ /* We don't need id and ref_obj_id from this point
+ * onwards anymore, thus we should better reset it,
+ * so that state pruning has chances to take effect.
+ */
+ reg->id = 0;
+ reg->ref_obj_id = 0;
+ } else if (!reg_may_point_to_spin_lock(reg)) {
+ /* For not-NULL ptr, reg->ref_obj_id will be reset
+ * in release_reg_references().
+ *
+ * reg->id is still used by spin_lock ptr. Other
+ * than spin_lock ptr type, reg->id can be reset.
*/
reg->id = 0;
}
}
}
+static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
+ bool is_null)
+{
+ struct bpf_reg_state *reg;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
+
+ bpf_for_each_spilled_reg(i, state, reg) {
+ if (!reg)
+ continue;
+ mark_ptr_or_null_reg(state, reg, id, is_null);
+ }
+}
+
/* The logic is similar to find_good_pkt_pointers(), both could eventually
* be folded together at some point.
*/
@@ -4361,24 +5668,20 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
bool is_null)
{
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *reg, *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs;
+ u32 ref_obj_id = regs[regno].ref_obj_id;
u32 id = regs[regno].id;
- int i, j;
-
- if (reg_is_refcounted_or_null(&regs[regno]) && is_null)
- __release_reference_state(state, id);
+ int i;
- for (i = 0; i < MAX_BPF_REG; i++)
- mark_ptr_or_null_reg(state, &regs[i], id, is_null);
+ if (ref_obj_id && ref_obj_id == id && is_null)
+ /* regs[regno] is in the " == NULL" branch.
+ * No one could have freed the reference state before
+ * doing the NULL check.
+ */
+ WARN_ON_ONCE(release_reference_state(state, id));
- for (j = 0; j <= vstate->curframe; j++) {
- state = vstate->frame[j];
- bpf_for_each_spilled_reg(i, state, reg) {
- if (!reg)
- continue;
- mark_ptr_or_null_reg(state, reg, id, is_null);
- }
- }
+ for (i = 0; i <= vstate->curframe; i++)
+ __mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
}
static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -4390,6 +5693,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
if (BPF_SRC(insn->code) != BPF_X)
return false;
+ /* Pointers are always 64-bit. */
+ if (BPF_CLASS(insn->code) == BPF_JMP32)
+ return false;
+
switch (BPF_OP(insn->code)) {
case BPF_JGT:
if ((dst_reg->type == PTR_TO_PACKET &&
@@ -4480,18 +5787,21 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *this_branch = env->cur_state;
struct bpf_verifier_state *other_branch;
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
- struct bpf_reg_state *dst_reg, *other_branch_regs;
+ struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
u8 opcode = BPF_OP(insn->code);
+ bool is_jmp32;
+ int pred = -1;
int err;
- if (opcode > BPF_JSLE) {
- verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
+ /* Only conditional jumps are expected to reach here. */
+ if (opcode == BPF_JA || opcode > BPF_JSLE) {
+ verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode);
return -EINVAL;
}
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
- verbose(env, "BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
@@ -4505,9 +5815,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
insn->src_reg);
return -EACCES;
}
+ src_reg = &regs[insn->src_reg];
} else {
if (insn->src_reg != BPF_REG_0) {
- verbose(env, "BPF_JMP uses reserved fields\n");
+ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
return -EINVAL;
}
}
@@ -4518,20 +5829,31 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return err;
dst_reg = &regs[insn->dst_reg];
-
- if (BPF_SRC(insn->code) == BPF_K) {
- int pred = is_branch_taken(dst_reg, insn->imm, opcode);
-
- if (pred == 1) {
- /* only follow the goto, ignore fall-through */
- *insn_idx += insn->off;
- return 0;
- } else if (pred == 0) {
- /* only follow fall-through branch, since
- * that's where the program will go
- */
- return 0;
- }
+ is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
+
+ if (BPF_SRC(insn->code) == BPF_K)
+ pred = is_branch_taken(dst_reg, insn->imm,
+ opcode, is_jmp32);
+ else if (src_reg->type == SCALAR_VALUE &&
+ tnum_is_const(src_reg->var_off))
+ pred = is_branch_taken(dst_reg, src_reg->var_off.value,
+ opcode, is_jmp32);
+ if (pred >= 0) {
+ err = mark_chain_precision(env, insn->dst_reg);
+ if (BPF_SRC(insn->code) == BPF_X && !err)
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
+ }
+ if (pred == 1) {
+ /* only follow the goto, ignore fall-through */
+ *insn_idx += insn->off;
+ return 0;
+ } else if (pred == 0) {
+ /* only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
}
other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
@@ -4548,30 +5870,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
* comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state lo_reg0 = *dst_reg;
+ struct bpf_reg_state lo_reg1 = *src_reg;
+ struct bpf_reg_state *src_lo, *dst_lo;
+
+ dst_lo = &lo_reg0;
+ src_lo = &lo_reg1;
+ coerce_reg_to_size(dst_lo, 4);
+ coerce_reg_to_size(src_lo, 4);
+
if (dst_reg->type == SCALAR_VALUE &&
- regs[insn->src_reg].type == SCALAR_VALUE) {
- if (tnum_is_const(regs[insn->src_reg].var_off))
+ src_reg->type == SCALAR_VALUE) {
+ if (tnum_is_const(src_reg->var_off) ||
+ (is_jmp32 && tnum_is_const(src_lo->var_off)))
reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, regs[insn->src_reg].var_off.value,
- opcode);
- else if (tnum_is_const(dst_reg->var_off))
+ dst_reg,
+ is_jmp32
+ ? src_lo->var_off.value
+ : src_reg->var_off.value,
+ opcode, is_jmp32);
+ else if (tnum_is_const(dst_reg->var_off) ||
+ (is_jmp32 && tnum_is_const(dst_lo->var_off)))
reg_set_min_max_inv(&other_branch_regs[insn->src_reg],
- &regs[insn->src_reg],
- dst_reg->var_off.value, opcode);
- else if (opcode == BPF_JEQ || opcode == BPF_JNE)
+ src_reg,
+ is_jmp32
+ ? dst_lo->var_off.value
+ : dst_reg->var_off.value,
+ opcode, is_jmp32);
+ else if (!is_jmp32 &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE))
/* Comparing for equality, we can combine knowledge */
reg_combine_min_max(&other_branch_regs[insn->src_reg],
&other_branch_regs[insn->dst_reg],
- &regs[insn->src_reg],
- &regs[insn->dst_reg], opcode);
+ src_reg, dst_reg, opcode);
}
} else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch_regs[insn->dst_reg],
- dst_reg, insn->imm, opcode);
+ dst_reg, insn->imm, opcode, is_jmp32);
}
- /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
- if (BPF_SRC(insn->code) == BPF_K &&
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
+ * NOTE: these optimizations below are related with pointer comparison
+ * which will never be JMP32.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
reg_type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
@@ -4588,23 +5931,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
insn->dst_reg);
return -EACCES;
}
- if (env->log.level)
+ if (env->log.level & BPF_LOG_LEVEL)
print_verifier_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
-/* return the map pointer stored inside BPF_LD_IMM64 instruction */
-static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
-{
- u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
-
- return (struct bpf_map *) (unsigned long) imm64;
-}
-
/* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
+ struct bpf_insn_aux_data *aux = cur_aux(env);
struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map;
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -4628,11 +5965,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
- /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
- BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+ map = env->used_maps[aux->map_index];
+ mark_reg_known_zero(env, regs, insn->dst_reg);
+ regs[insn->dst_reg].map_ptr = map;
+
+ if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].off = aux->map_off;
+ if (map_value_has_spin_lock(map))
+ regs[insn->dst_reg].id = ++env->id_gen;
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ } else {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
- regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
- regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
return 0;
}
@@ -4713,6 +6061,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
+ if (env->cur_state->active_spin_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[BPF_REG_6].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -4737,20 +6090,32 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* Already marked as written above.
*/
mark_reg_unknown(env, regs, BPF_REG_0);
+ /* ld_abs load up to 32-bit skb data. */
+ regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
return 0;
}
static int check_return_code(struct bpf_verifier_env *env)
{
+ struct tnum enforce_attach_type_range = tnum_unknown;
struct bpf_reg_state *reg;
struct tnum range = tnum_range(0, 1);
switch (env->prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+ range = tnum_range(1, 1);
case BPF_PROG_TYPE_CGROUP_SKB:
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
+ range = tnum_range(0, 3);
+ enforce_attach_type_range = tnum_range(2, 3);
+ }
case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
break;
default:
return 0;
@@ -4764,18 +6129,23 @@ static int check_return_code(struct bpf_verifier_env *env)
}
if (!tnum_in(range, reg->var_off)) {
+ char tn_buf[48];
+
verbose(env, "At program exit the register R0 ");
if (!tnum_is_unknown(reg->var_off)) {
- char tn_buf[48];
-
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "has value %s", tn_buf);
} else {
verbose(env, "has unknown scalar value");
}
- verbose(env, " should have been 0 or 1\n");
+ tnum_strn(tn_buf, sizeof(tn_buf), range);
+ verbose(env, " should have been in %s\n", tn_buf);
return -EINVAL;
}
+
+ if (!tnum_is_unknown(enforce_attach_type_range) &&
+ tnum_in(enforce_attach_type_range, reg->var_off))
+ env->prog->enforce_expected_attach_type = 1;
return 0;
}
@@ -4819,19 +6189,37 @@ enum {
BRANCH = 2,
};
-#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+ return env->prog->len;
+}
+
+static struct bpf_verifier_state_list **explored_state(
+ struct bpf_verifier_env *env,
+ int idx)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_func_state *state = cur->frame[cur->curframe];
+
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+}
-static int *insn_stack; /* stack of insns to process */
-static int cur_stack; /* current stack index */
-static int *insn_state;
+static void init_explored_state(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].prune_point = true;
+}
/* t, w, e - match pseudo-code above:
* t - index of current instruction
* w - next instruction
* e - edge
*/
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
+ bool loop_ok)
{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
return 0;
@@ -4846,17 +6234,19 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
if (e == BRANCH)
/* mark branch target for state pruning */
- env->explored_states[w] = STATE_LIST_MARK;
+ init_explored_state(env, w);
if (insn_state[w] == 0) {
/* tree-edge */
insn_state[t] = DISCOVERED | e;
insn_state[w] = DISCOVERED;
- if (cur_stack >= env->prog->len)
+ if (env->cfg.cur_stack >= env->prog->len)
return -E2BIG;
- insn_stack[cur_stack++] = w;
+ insn_stack[env->cfg.cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ if (loop_ok && env->allow_ptr_leaks)
+ return 0;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
verbose(env, "back-edge from insn %d to %d\n", t, w);
@@ -4878,44 +6268,47 @@ static int check_cfg(struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
+ int *insn_stack, *insn_state;
int ret = 0;
int i, t;
- insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
return -ENOMEM;
- insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_stack) {
- kfree(insn_state);
+ kvfree(insn_state);
return -ENOMEM;
}
insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
insn_stack[0] = 0; /* 0 is the first instruction */
- cur_stack = 1;
+ env->cfg.cur_stack = 1;
peek_stack:
- if (cur_stack == 0)
+ if (env->cfg.cur_stack == 0)
goto check_state;
- t = insn_stack[cur_stack - 1];
+ t = insn_stack[env->cfg.cur_stack - 1];
- if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ if (BPF_CLASS(insns[t].code) == BPF_JMP ||
+ BPF_CLASS(insns[t].code) == BPF_JMP32) {
u8 opcode = BPF_OP(insns[t].code);
if (opcode == BPF_EXIT) {
goto mark_explored;
} else if (opcode == BPF_CALL) {
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
if (t + 1 < insn_cnt)
- env->explored_states[t + 1] = STATE_LIST_MARK;
+ init_explored_state(env, t + 1);
if (insns[t].src_reg == BPF_PSEUDO_CALL) {
- env->explored_states[t] = STATE_LIST_MARK;
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+ init_explored_state(env, t);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
+ env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
@@ -4928,26 +6321,31 @@ peek_stack:
}
/* unconditional jump with single edge */
ret = push_insn(t, t + insns[t].off + 1,
- FALLTHROUGH, env);
+ FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
+ /* unconditional jmp is not a good pruning point,
+ * but it's marked, since backtracking needs
+ * to record jmp history in is_state_visited().
+ */
+ init_explored_state(env, t + insns[t].off + 1);
/* tell verifier to check for equivalent states
* after every call and jump
*/
if (t + 1 < insn_cnt)
- env->explored_states[t + 1] = STATE_LIST_MARK;
+ init_explored_state(env, t + 1);
} else {
/* conditional jump with two edges */
- env->explored_states[t] = STATE_LIST_MARK;
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ init_explored_state(env, t);
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
- ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
@@ -4957,7 +6355,7 @@ peek_stack:
/* all other non-branch instructions with single
* fall-through edge
*/
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
@@ -4966,7 +6364,7 @@ peek_stack:
mark_explored:
insn_state[t] = EXPLORED;
- if (cur_stack-- <= 0) {
+ if (env->cfg.cur_stack-- <= 0) {
verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
@@ -4984,8 +6382,9 @@ check_state:
ret = 0; /* cfg looks good */
err_free:
- kfree(insn_state);
- kfree(insn_stack);
+ kvfree(insn_state);
+ kvfree(insn_stack);
+ env->cfg.insn_state = env->cfg.insn_stack = NULL;
return ret;
}
@@ -4997,13 +6396,14 @@ static int check_btf_func(struct bpf_verifier_env *env,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- u32 i, nfuncs, urec_size, min_size, prev_offset;
+ u32 i, nfuncs, urec_size, min_size;
u32 krec_size = sizeof(struct bpf_func_info);
struct bpf_func_info *krecord;
const struct btf_type *type;
struct bpf_prog *prog;
const struct btf *btf;
void __user *urecord;
+ u32 prev_offset = 0;
int ret = 0;
nfuncs = attr->func_info_cnt;
@@ -5386,12 +6786,12 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state_list *sl;
int i;
- sl = env->explored_states[insn];
- if (!sl)
- return;
-
- while (sl != STATE_LIST_MARK) {
- if (sl->state.curframe != cur->curframe)
+ sl = *explored_state(env, insn);
+ while (sl) {
+ if (sl->state.branches)
+ goto next;
+ if (sl->state.insn_idx != insn ||
+ sl->state.curframe != cur->curframe)
goto next;
for (i = 0; i <= cur->curframe; i++)
if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
@@ -5431,6 +6831,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
switch (rold->type) {
case SCALAR_VALUE:
if (rcur->type == SCALAR_VALUE) {
+ if (!rold->precise && !rcur->precise)
+ return true;
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
@@ -5447,8 +6849,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
- * We don't care about the 'id' value, because nothing
- * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+ * 'id' is not compared, since it's only used for maps with
+ * bpf_spin_lock inside map element and in such cases if
+ * the rest of the prog is valid for one map element then
+ * it's valid for all map elements regardless of the key
+ * used in bpf_map_lookup()
*/
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
range_within(rold, rcur) &&
@@ -5496,6 +6901,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_FLOW_KEYS:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
+ case PTR_TO_XDP_SOCK:
/* Only valid matches are exact, which memcmp() above
* would have accepted
*/
@@ -5651,6 +7061,9 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->speculative && !cur->speculative)
return false;
+ if (old->active_spin_lock != cur->active_spin_lock)
+ return false;
+
/* for states to be equal callsites have to be the same
* and all frame states need to be equivalent
*/
@@ -5663,6 +7076,35 @@ static bool states_equal(struct bpf_verifier_env *env,
return true;
}
+/* Return 0 if no propagation happened. Return negative error code if error
+ * happened. Otherwise, return the propagated bit.
+ */
+static int propagate_liveness_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ struct bpf_reg_state *parent_reg)
+{
+ u8 parent_flag = parent_reg->live & REG_LIVE_READ;
+ u8 flag = reg->live & REG_LIVE_READ;
+ int err;
+
+ /* When comes here, read flags of PARENT_REG or REG could be any of
+ * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
+ * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
+ */
+ if (parent_flag == REG_LIVE_READ64 ||
+ /* Or if there is no read flag from REG. */
+ !flag ||
+ /* Or if the read flag from REG is the same as PARENT_REG. */
+ parent_flag == flag)
+ return 0;
+
+ err = mark_reg_read(env, reg, parent_reg, flag);
+ if (err)
+ return err;
+
+ return flag;
+}
+
/* A write screens off any subsequent reads; but write marks come from the
* straight-line code between a state and its parent. When we arrive at an
* equivalent state (jump target or such) we didn't arrive by the straight-line
@@ -5674,8 +7116,9 @@ static int propagate_liveness(struct bpf_verifier_env *env,
const struct bpf_verifier_state *vstate,
struct bpf_verifier_state *vparent)
{
- int i, frame, err = 0;
+ struct bpf_reg_state *state_reg, *parent_reg;
struct bpf_func_state *state, *parent;
+ int i, frame, err = 0;
if (vparent->curframe != vstate->curframe) {
WARN(1, "propagate_live: parent frame %d current frame %d\n",
@@ -5684,52 +7127,156 @@ static int propagate_liveness(struct bpf_verifier_env *env,
}
/* Propagate read liveness of registers... */
BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
- /* We don't need to worry about FP liveness because it's read-only */
- for (i = 0; i < BPF_REG_FP; i++) {
- if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ)
- continue;
- if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) {
- err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i],
- &vparent->frame[vstate->curframe]->regs[i]);
- if (err)
+ for (frame = 0; frame <= vstate->curframe; frame++) {
+ parent = vparent->frame[frame];
+ state = vstate->frame[frame];
+ parent_reg = parent->regs;
+ state_reg = state->regs;
+ /* We don't need to worry about FP liveness, it's read-only */
+ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
+ err = propagate_liveness_reg(env, &state_reg[i],
+ &parent_reg[i]);
+ if (err < 0)
return err;
+ if (err == REG_LIVE_READ64)
+ mark_insn_zext(env, &parent_reg[i]);
}
- }
- /* ... and stack slots */
- for (frame = 0; frame <= vstate->curframe; frame++) {
- state = vstate->frame[frame];
- parent = vparent->frame[frame];
+ /* Propagate stack slots. */
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
i < parent->allocated_stack / BPF_REG_SIZE; i++) {
- if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
- continue;
- if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
- mark_reg_read(env, &state->stack[i].spilled_ptr,
- &parent->stack[i].spilled_ptr);
+ parent_reg = &parent->stack[i].spilled_ptr;
+ state_reg = &state->stack[i].spilled_ptr;
+ err = propagate_liveness_reg(env, state_reg,
+ parent_reg);
+ if (err < 0)
+ return err;
}
}
- return err;
+ return 0;
}
+/* find precise scalars in the previous equivalent state and
+ * propagate them into the current state
+ */
+static int propagate_precision(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *old)
+{
+ struct bpf_reg_state *state_reg;
+ struct bpf_func_state *state;
+ int i, err = 0;
+
+ state = old->frame[old->curframe];
+ state_reg = state->regs;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "propagating r%d\n", i);
+ err = mark_chain_precision(env, i);
+ if (err < 0)
+ return err;
+ }
+
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "propagating fp%d\n",
+ (-i - 1) * BPF_REG_SIZE);
+ err = mark_chain_precision_stack(env, i);
+ if (err < 0)
+ return err;
+ }
+ return 0;
+}
+
+static bool states_maybe_looping(struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_func_state *fold, *fcur;
+ int i, fr = cur->curframe;
+
+ if (old->curframe != fr)
+ return false;
+
+ fold = old->frame[fr];
+ fcur = cur->frame[fr];
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (memcmp(&fold->regs[i], &fcur->regs[i],
+ offsetof(struct bpf_reg_state, parent)))
+ return false;
+ return true;
+}
+
+
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
- struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
+ bool add_new_state = false;
- sl = env->explored_states[insn_idx];
- if (!sl)
+ cur->last_insn_idx = env->prev_insn_idx;
+ if (!env->insn_aux_data[insn_idx].prune_point)
/* this 'insn_idx' instruction wasn't marked, so we will not
* be doing state search here
*/
return 0;
+ /* bpf progs typically have pruning point every 4 instructions
+ * http://vger.kernel.org/bpfconf2019.html#session-1
+ * Do not add new state for future pruning if the verifier hasn't seen
+ * at least 2 jumps and at least 8 instructions.
+ * This heuristics helps decrease 'total_states' and 'peak_states' metric.
+ * In tests that amounts to up to 50% reduction into total verifier
+ * memory consumption and 20% verifier time speedup.
+ */
+ if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
+ env->insn_processed - env->prev_insn_processed >= 8)
+ add_new_state = true;
+
+ pprev = explored_state(env, insn_idx);
+ sl = *pprev;
+
clean_live_states(env, insn_idx, cur);
- while (sl != STATE_LIST_MARK) {
+ while (sl) {
+ states_cnt++;
+ if (sl->state.insn_idx != insn_idx)
+ goto next;
+ if (sl->state.branches) {
+ if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur)) {
+ verbose_linfo(env, insn_idx, "; ");
+ verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+ return -EINVAL;
+ }
+ /* if the verifier is processing a loop, avoid adding new state
+ * too often, since different loop iterations have distinct
+ * states and may not help future pruning.
+ * This threshold shouldn't be too low to make sure that
+ * a loop with large bound will be rejected quickly.
+ * The most abusive loop will be:
+ * r1 += 1
+ * if r1 < 1000000 goto pc-2
+ * 1M insn_procssed limit / 100 == 10k peak states.
+ * This threshold shouldn't be too high either, since states
+ * at the end of the loop are likely to be useful in pruning.
+ */
+ if (env->jmps_processed - env->prev_jmps_processed < 20 &&
+ env->insn_processed - env->prev_insn_processed < 100)
+ add_new_state = false;
+ goto miss;
+ }
if (states_equal(env, &sl->state, cur)) {
+ sl->hit_cnt++;
/* reached equivalent register/stack state,
* prune the search.
* Registers read by the continuation are read by us.
@@ -5741,27 +7288,87 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* this state and will pop a new one.
*/
err = propagate_liveness(env, &sl->state, cur);
+
+ /* if previous state reached the exit with precision and
+ * current state is equivalent to it (except precsion marks)
+ * the precision needs to be propagated back in
+ * the current state.
+ */
+ err = err ? : push_jmp_history(env, cur);
+ err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
return 1;
}
- sl = sl->next;
- states_cnt++;
+miss:
+ /* when new state is not going to be added do not increase miss count.
+ * Otherwise several loop iterations will remove the state
+ * recorded earlier. The goal of these heuristics is to have
+ * states from some iterations of the loop (some in the beginning
+ * and some at the end) to help pruning.
+ */
+ if (add_new_state)
+ sl->miss_cnt++;
+ /* heuristic to determine whether this state is beneficial
+ * to keep checking from state equivalence point of view.
+ * Higher numbers increase max_states_per_insn and verification time,
+ * but do not meaningfully decrease insn_processed.
+ */
+ if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
+ /* the state is unlikely to be useful. Remove it to
+ * speed up verification
+ */
+ *pprev = sl->next;
+ if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+ u32 br = sl->state.branches;
+
+ WARN_ONCE(br,
+ "BUG live_done but branches_to_explore %d\n",
+ br);
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ env->peak_states--;
+ } else {
+ /* cannot free this state, since parentage chain may
+ * walk it later. Add it for free_list instead to
+ * be freed at the end of verification
+ */
+ sl->next = env->free_list;
+ env->free_list = sl;
+ }
+ sl = *pprev;
+ continue;
+ }
+next:
+ pprev = &sl->next;
+ sl = *pprev;
}
+ if (env->max_states_per_insn < states_cnt)
+ env->max_states_per_insn = states_cnt;
+
if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return 0;
+ return push_jmp_history(env, cur);
- /* there were no equivalent states, remember current one.
- * technically the current state is not proven to be safe yet,
+ if (!add_new_state)
+ return push_jmp_history(env, cur);
+
+ /* There were no equivalent states, remember the current one.
+ * Technically the current state is not proven to be safe yet,
* but it will either reach outer most bpf_exit (which means it's safe)
- * or it will be rejected. Since there are no loops, we won't be
+ * or it will be rejected. When there are no loops the verifier won't be
* seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
- * again on the way to bpf_exit
+ * again on the way to bpf_exit.
+ * When looping the sl->state.branches will be > 0 and this state
+ * will not be considered for equivalence until branches == 0.
*/
new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
if (!new_sl)
return -ENOMEM;
+ env->total_states++;
+ env->peak_states++;
+ env->prev_jmps_processed = env->jmps_processed;
+ env->prev_insn_processed = env->insn_processed;
/* add new state to the head of linked list */
new = &new_sl->state;
@@ -5771,8 +7378,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
kfree(new_sl);
return err;
}
- new_sl->next = env->explored_states[insn_idx];
- env->explored_states[insn_idx] = new_sl;
+ new->insn_idx = insn_idx;
+ WARN_ONCE(new->branches != 1,
+ "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
+
+ cur->parent = new;
+ cur->first_insn_idx = insn_idx;
+ clear_jmp_history(cur);
+ new_sl->next = *explored_state(env, insn_idx);
+ *explored_state(env, insn_idx) = new_sl;
/* connect new state to parentage chain. Current frame needs all
* registers connected. Only r6 - r9 of the callers are alive (pushed
* to the stack implicitly by JITs) so in callers' frames connect just
@@ -5780,17 +7394,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* the state of the call instruction (with WRITTEN set), and r0 comes
* from callee with its full parentage chain, anyway.
*/
- for (j = 0; j <= cur->curframe; j++)
- for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
- cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
* their parent and current state never has children yet. Only
* explored_states can get read marks.)
*/
- for (i = 0; i < BPF_REG_FP; i++)
- cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
+ for (j = 0; j <= cur->curframe; j++) {
+ for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
+ cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
+ for (i = 0; i < BPF_REG_FP; i++)
+ cur->frame[j]->regs[i].live = REG_LIVE_NONE;
+ }
/* all stack frames are accessible from callee, clear them all */
for (j = 0; j <= cur->curframe; j++) {
@@ -5813,6 +7428,11 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_CTX:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
+ case PTR_TO_SOCK_COMMON:
+ case PTR_TO_SOCK_COMMON_OR_NULL:
+ case PTR_TO_TCP_SOCK:
+ case PTR_TO_TCP_SOCK_OR_NULL:
+ case PTR_TO_XDP_SOCK:
return false;
default:
return true;
@@ -5842,9 +7462,9 @@ static int do_check(struct bpf_verifier_env *env)
struct bpf_verifier_state *state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
- int insn_cnt = env->prog->len, i;
- int insn_processed = 0;
+ int insn_cnt = env->prog->len;
bool do_print_state = false;
+ int prev_insn_idx = -1;
env->prev_linfo = NULL;
@@ -5853,6 +7473,7 @@ static int do_check(struct bpf_verifier_env *env)
return -ENOMEM;
state->curframe = 0;
state->speculative = false;
+ state->branches = 1;
state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
if (!state->frame[0]) {
kfree(state);
@@ -5869,6 +7490,7 @@ static int do_check(struct bpf_verifier_env *env)
u8 class;
int err;
+ env->prev_insn_idx = prev_insn_idx;
if (env->insn_idx >= insn_cnt) {
verbose(env, "invalid insn idx %d insn_cnt %d\n",
env->insn_idx, insn_cnt);
@@ -5878,10 +7500,10 @@ static int do_check(struct bpf_verifier_env *env)
insn = &insns[env->insn_idx];
class = BPF_CLASS(insn->code);
- if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
+ if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
verbose(env,
"BPF program is too large. Processed %d insn\n",
- insn_processed);
+ env->insn_processed);
return -E2BIG;
}
@@ -5890,7 +7512,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
- if (env->log.level) {
+ if (env->log.level & BPF_LOG_LEVEL) {
if (do_print_state)
verbose(env, "\nfrom %d to %d%s: safe\n",
env->prev_insn_idx, env->insn_idx,
@@ -5908,8 +7530,9 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (env->log.level > 1 || (env->log.level && do_print_state)) {
- if (env->log.level > 1)
+ if (env->log.level & BPF_LOG_LEVEL2 ||
+ (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
+ if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "%d:", env->insn_idx);
else
verbose(env, "\nfrom %d to %d%s:",
@@ -5920,7 +7543,7 @@ static int do_check(struct bpf_verifier_env *env)
do_print_state = false;
}
- if (env->log.level) {
+ if (env->log.level & BPF_LOG_LEVEL) {
const struct bpf_insn_cbs cbs = {
.cb_print = verbose,
.private_data = env,
@@ -5940,6 +7563,7 @@ static int do_check(struct bpf_verifier_env *env)
regs = cur_regs(env);
env->insn_aux_data[env->insn_idx].seen = true;
+ prev_insn_idx = env->insn_idx;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
@@ -6055,19 +7679,27 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- } else if (class == BPF_JMP) {
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);
+ env->jmps_processed++;
if (opcode == BPF_CALL) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->off != 0 ||
(insn->src_reg != BPF_REG_0 &&
insn->src_reg != BPF_PSEUDO_CALL) ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_CALL uses reserved fields\n");
return -EINVAL;
}
+ if (env->cur_state->active_spin_lock &&
+ (insn->src_reg == BPF_PSEUDO_CALL ||
+ insn->imm != BPF_FUNC_spin_unlock)) {
+ verbose(env, "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else
@@ -6079,7 +7711,8 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_JA uses reserved fields\n");
return -EINVAL;
}
@@ -6091,14 +7724,19 @@ static int do_check(struct bpf_verifier_env *env)
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
- insn->dst_reg != BPF_REG_0) {
+ insn->dst_reg != BPF_REG_0 ||
+ class == BPF_JMP32) {
verbose(env, "BPF_EXIT uses reserved fields\n");
return -EINVAL;
}
+ if (env->cur_state->active_spin_lock) {
+ verbose(env, "bpf_spin_unlock is missing\n");
+ return -EINVAL;
+ }
+
if (state->curframe) {
/* exit from nested function */
- env->prev_insn_idx = env->insn_idx;
err = prepare_func_exit(env, &env->insn_idx);
if (err)
return err;
@@ -6129,7 +7767,8 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
process_bpf_exit:
- err = pop_stack(env, &env->prev_insn_idx,
+ update_branch_counts(env, env->cur_state);
+ err = pop_stack(env, &prev_insn_idx,
&env->insn_idx);
if (err < 0) {
if (err != -ENOENT)
@@ -6171,16 +7810,6 @@ process_bpf_exit:
env->insn_idx++;
}
- verbose(env, "processed %d insns (limit %d), stack depth ",
- insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
- for (i = 0; i < env->subprog_cnt; i++) {
- u32 depth = env->subprog_info[i].stack_depth;
-
- verbose(env, "%d", depth);
- if (i + 1 < env->subprog_cnt)
- verbose(env, "+");
- }
- verbose(env, "\n");
env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return 0;
}
@@ -6193,6 +7822,19 @@ static int check_map_prealloc(struct bpf_map *map)
!(map->map_flags & BPF_F_NO_PREALLOC);
}
+static bool is_tracing_prog_type(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
@@ -6215,6 +7857,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
+ if ((is_tracing_prog_type(prog->type) ||
+ prog->type == BPF_PROG_TYPE_SOCKET_FILTER) &&
+ map_value_has_spin_lock(map)) {
+ verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
+ return -EINVAL;
+ }
+
if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -6258,8 +7907,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
}
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_insn_aux_data *aux;
struct bpf_map *map;
struct fd f;
+ u64 addr;
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
@@ -6268,21 +7919,27 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (insn->src_reg == 0)
+ if (insn[0].src_reg == 0)
/* valid generic load 64-bit imm */
goto next_insn;
- if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+ /* In final convert_pseudo_ld_imm64() step, this is
+ * converted into regular 64-bit imm load insn.
+ */
+ if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
+ insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
+ (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
+ insn[1].imm != 0)) {
verbose(env,
"unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
- f = fdget(insn->imm);
+ f = fdget(insn[0].imm);
map = __bpf_map_get(f);
if (IS_ERR(map)) {
verbose(env, "fd %d is not pointing to valid bpf_map\n",
- insn->imm);
+ insn[0].imm);
return PTR_ERR(map);
}
@@ -6292,16 +7949,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
return err;
}
- /* store map pointer inside BPF_LD_IMM64 instruction */
- insn[0].imm = (u32) (unsigned long) map;
- insn[1].imm = ((u64) (unsigned long) map) >> 32;
+ aux = &env->insn_aux_data[i];
+ if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+ addr = (unsigned long)map;
+ } else {
+ u32 off = insn[1].imm;
+
+ if (off >= BPF_MAX_VAR_OFF) {
+ verbose(env, "direct value offset of %u is not allowed\n", off);
+ fdput(f);
+ return -EINVAL;
+ }
+
+ if (!map->ops->map_direct_value_addr) {
+ verbose(env, "no direct value access support for this map type\n");
+ fdput(f);
+ return -EINVAL;
+ }
+
+ err = map->ops->map_direct_value_addr(map, &addr, off);
+ if (err) {
+ verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
+ map->value_size, off);
+ fdput(f);
+ return err;
+ }
+
+ aux->map_off = off;
+ addr += off;
+ }
+
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
/* check whether we recorded this map already */
- for (j = 0; j < env->used_map_cnt; j++)
+ for (j = 0; j < env->used_map_cnt; j++) {
if (env->used_maps[j] == map) {
+ aux->map_index = j;
fdput(f);
goto next_insn;
}
+ }
if (env->used_map_cnt >= MAX_USED_MAPS) {
fdput(f);
@@ -6318,6 +8006,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
fdput(f);
return PTR_ERR(map);
}
+
+ aux->map_index = env->used_map_cnt;
env->used_maps[env->used_map_cnt++] = map;
if (bpf_map_is_cgroup_storage(map) &&
@@ -6381,14 +8071,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
-static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
- u32 off, u32 cnt)
+static int adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ struct bpf_insn *insn = new_prog->insnsi;
+ u32 prog_len;
int i;
+ /* aux info at OFF always needs adjustment, no matter fast path
+ * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
+ * original insn at old prog.
+ */
+ old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
+
if (cnt == 1)
return 0;
+ prog_len = new_prog->len;
new_data = vzalloc(array_size(prog_len,
sizeof(struct bpf_insn_aux_data)));
if (!new_data)
@@ -6396,8 +8095,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
- for (i = off; i < off + cnt - 1; i++)
+ for (i = off; i < off + cnt - 1; i++) {
new_data[i].seen = true;
+ new_data[i].zext_dst = insn_has_def32(env, insn + i);
+ }
env->insn_aux_data = new_data;
vfree(old_data);
return 0;
@@ -6423,14 +8124,166 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
struct bpf_prog *new_prog;
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
- if (!new_prog)
+ if (IS_ERR(new_prog)) {
+ if (PTR_ERR(new_prog) == -ERANGE)
+ verbose(env,
+ "insn %d cannot be patched due to 16-bit range\n",
+ env->insn_aux_data[off].orig_idx);
return NULL;
- if (adjust_insn_aux_data(env, new_prog->len, off, len))
+ }
+ if (adjust_insn_aux_data(env, new_prog, off, len))
return NULL;
adjust_subprog_starts(env, off, len);
return new_prog;
}
+static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env,
+ u32 off, u32 cnt)
+{
+ int i, j;
+
+ /* find first prog starting at or after off (first to remove) */
+ for (i = 0; i < env->subprog_cnt; i++)
+ if (env->subprog_info[i].start >= off)
+ break;
+ /* find first prog starting at or after off + cnt (first to stay) */
+ for (j = i; j < env->subprog_cnt; j++)
+ if (env->subprog_info[j].start >= off + cnt)
+ break;
+ /* if j doesn't start exactly at off + cnt, we are just removing
+ * the front of previous prog
+ */
+ if (env->subprog_info[j].start != off + cnt)
+ j--;
+
+ if (j > i) {
+ struct bpf_prog_aux *aux = env->prog->aux;
+ int move;
+
+ /* move fake 'exit' subprog as well */
+ move = env->subprog_cnt + 1 - j;
+
+ memmove(env->subprog_info + i,
+ env->subprog_info + j,
+ sizeof(*env->subprog_info) * move);
+ env->subprog_cnt -= j - i;
+
+ /* remove func_info */
+ if (aux->func_info) {
+ move = aux->func_info_cnt - j;
+
+ memmove(aux->func_info + i,
+ aux->func_info + j,
+ sizeof(*aux->func_info) * move);
+ aux->func_info_cnt -= j - i;
+ /* func_info->insn_off is set after all code rewrites,
+ * in adjust_btf_func() - no need to adjust
+ */
+ }
+ } else {
+ /* convert i from "first prog to remove" to "first to adjust" */
+ if (env->subprog_info[i].start == off)
+ i++;
+ }
+
+ /* update fake 'exit' subprog as well */
+ for (; i <= env->subprog_cnt; i++)
+ env->subprog_info[i].start -= cnt;
+
+ return 0;
+}
+
+static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off,
+ u32 cnt)
+{
+ struct bpf_prog *prog = env->prog;
+ u32 i, l_off, l_cnt, nr_linfo;
+ struct bpf_line_info *linfo;
+
+ nr_linfo = prog->aux->nr_linfo;
+ if (!nr_linfo)
+ return 0;
+
+ linfo = prog->aux->linfo;
+
+ /* find first line info to remove, count lines to be removed */
+ for (i = 0; i < nr_linfo; i++)
+ if (linfo[i].insn_off >= off)
+ break;
+
+ l_off = i;
+ l_cnt = 0;
+ for (; i < nr_linfo; i++)
+ if (linfo[i].insn_off < off + cnt)
+ l_cnt++;
+ else
+ break;
+
+ /* First live insn doesn't match first live linfo, it needs to "inherit"
+ * last removed linfo. prog is already modified, so prog->len == off
+ * means no live instructions after (tail of the program was removed).
+ */
+ if (prog->len != off && l_cnt &&
+ (i == nr_linfo || linfo[i].insn_off != off + cnt)) {
+ l_cnt--;
+ linfo[--i].insn_off = off + cnt;
+ }
+
+ /* remove the line info which refer to the removed instructions */
+ if (l_cnt) {
+ memmove(linfo + l_off, linfo + i,
+ sizeof(*linfo) * (nr_linfo - i));
+
+ prog->aux->nr_linfo -= l_cnt;
+ nr_linfo = prog->aux->nr_linfo;
+ }
+
+ /* pull all linfo[i].insn_off >= off + cnt in by cnt */
+ for (i = l_off; i < nr_linfo; i++)
+ linfo[i].insn_off -= cnt;
+
+ /* fix up all subprogs (incl. 'exit') which start >= off */
+ for (i = 0; i <= env->subprog_cnt; i++)
+ if (env->subprog_info[i].linfo_idx > l_off) {
+ /* program may have started in the removed region but
+ * may not be fully removed
+ */
+ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt)
+ env->subprog_info[i].linfo_idx -= l_cnt;
+ else
+ env->subprog_info[i].linfo_idx = l_off;
+ }
+
+ return 0;
+}
+
+static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ unsigned int orig_prog_len = env->prog->len;
+ int err;
+
+ if (bpf_prog_is_dev_bound(env->prog->aux))
+ bpf_prog_offload_remove_insns(env, off, cnt);
+
+ err = bpf_remove_insns(env->prog, off, cnt);
+ if (err)
+ return err;
+
+ err = adjust_subprog_starts_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ err = bpf_adj_linfo_after_remove(env, off, cnt);
+ if (err)
+ return err;
+
+ memmove(aux_data + off, aux_data + off + cnt,
+ sizeof(*aux_data) * (orig_prog_len - off - cnt));
+
+ return 0;
+}
+
/* The verifier does more data flow analysis than llvm and will not
* explore branches that are dead at run time. Malicious programs can
* have dead code too. Therefore replace all dead at-run-time code
@@ -6457,6 +8310,169 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
}
}
+static bool insn_is_cond_jump(u8 code)
+{
+ u8 op;
+
+ if (BPF_CLASS(code) == BPF_JMP32)
+ return true;
+
+ if (BPF_CLASS(code) != BPF_JMP)
+ return false;
+
+ op = BPF_OP(code);
+ return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+}
+
+static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!insn_is_cond_jump(insn->code))
+ continue;
+
+ if (!aux_data[i + 1].seen)
+ ja.off = insn->off;
+ else if (!aux_data[i + 1 + insn->off].seen)
+ ja.off = 0;
+ else
+ continue;
+
+ if (bpf_prog_is_dev_bound(env->prog->aux))
+ bpf_prog_offload_replace_insn(env, i, &ja);
+
+ memcpy(insn, &ja, sizeof(ja));
+ }
+}
+
+static int opt_remove_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ int j;
+
+ j = 0;
+ while (i + j < insn_cnt && !aux_data[i + j].seen)
+ j++;
+ if (!j)
+ continue;
+
+ err = verifier_remove_insns(env, i, j);
+ if (err)
+ return err;
+ insn_cnt = env->prog->len;
+ }
+
+ return 0;
+}
+
+static int opt_remove_nops(struct bpf_verifier_env *env)
+{
+ const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, err;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (memcmp(&insn[i], &ja, sizeof(ja)))
+ continue;
+
+ err = verifier_remove_insns(env, i, 1);
+ if (err)
+ return err;
+ insn_cnt--;
+ i--;
+ }
+
+ return 0;
+}
+
+static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
+ const union bpf_attr *attr)
+{
+ struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ int i, patch_len, delta = 0, len = env->prog->len;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct bpf_prog *new_prog;
+ bool rnd_hi32;
+
+ rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
+ zext_patch[1] = BPF_ZEXT_REG(0);
+ rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
+ rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+ rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
+ for (i = 0; i < len; i++) {
+ int adj_idx = i + delta;
+ struct bpf_insn insn;
+
+ insn = insns[adj_idx];
+ if (!aux[adj_idx].zext_dst) {
+ u8 code, class;
+ u32 imm_rnd;
+
+ if (!rnd_hi32)
+ continue;
+
+ code = insn.code;
+ class = BPF_CLASS(code);
+ if (insn_no_def(&insn))
+ continue;
+
+ /* NOTE: arg "reg" (the fourth one) is only used for
+ * BPF_STX which has been ruled out in above
+ * check, it is safe to pass NULL here.
+ */
+ if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
+ if (class == BPF_LD &&
+ BPF_MODE(code) == BPF_IMM)
+ i++;
+ continue;
+ }
+
+ /* ctx load could be transformed into wider load. */
+ if (class == BPF_LDX &&
+ aux[adj_idx].ptr_type == PTR_TO_CTX)
+ continue;
+
+ imm_rnd = get_random_int();
+ rnd_hi32_patch[0] = insn;
+ rnd_hi32_patch[1].imm = imm_rnd;
+ rnd_hi32_patch[3].dst_reg = insn.dst_reg;
+ patch = rnd_hi32_patch;
+ patch_len = 4;
+ goto apply_patch_buffer;
+ }
+
+ if (!bpf_jit_needs_zext())
+ continue;
+
+ zext_patch[0] = insn;
+ zext_patch[1].dst_reg = insn.dst_reg;
+ zext_patch[1].src_reg = insn.dst_reg;
+ patch = zext_patch;
+ patch_len = 2;
+apply_patch_buffer:
+ new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ insns = new_prog->insnsi;
+ aux = env->insn_aux_data;
+ delta += patch_len - 1;
+ }
+
+ return 0;
+}
+
/* convert load instructions that access fields of a context type into a
* sequence of instructions that access fields of the underlying structure:
* struct __sk_buff -> struct sk_buff
@@ -6549,8 +8565,15 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = ops->convert_ctx_access;
break;
case PTR_TO_SOCKET:
+ case PTR_TO_SOCK_COMMON:
convert_ctx_access = bpf_sock_convert_ctx_access;
break;
+ case PTR_TO_TCP_SOCK:
+ convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
+ break;
+ case PTR_TO_XDP_SOCK:
+ convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
+ break;
default:
continue;
}
@@ -6609,7 +8632,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->dst_reg,
shift);
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
- (1 << size * 8) - 1);
+ (1ULL << size * 8) - 1);
}
}
@@ -6678,7 +8701,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
subprog_end = env->subprog_info[i + 1].start;
len = subprog_end - subprog_start;
- func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER);
+ /* BPF_PROG_RUN doesn't call subprogs directly,
+ * hence main prog stats include the runtime of subprogs.
+ * subprogs don't have IDs and not reachable via prog_get_next_id
+ * func[i]->aux->stats will never be accessed and stays NULL
+ */
+ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
if (!func[i])
goto out_free;
memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
@@ -6721,9 +8749,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn->src_reg != BPF_PSEUDO_CALL)
continue;
subprog = insn->off;
- insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
- func[subprog]->bpf_func -
- __bpf_call_base;
+ insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
+ __bpf_call_base;
}
/* we use the aux data to keep a list of the start addresses
@@ -6917,7 +8944,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
u32 off_reg;
aux = &env->insn_aux_data[i + delta];
- if (!aux->alu_state)
+ if (!aux->alu_state ||
+ aux->alu_state == BPF_ALU_NON_POINTER)
continue;
isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
@@ -7124,30 +9152,63 @@ static void free_states(struct bpf_verifier_env *env)
struct bpf_verifier_state_list *sl, *sln;
int i;
+ sl = env->free_list;
+ while (sl) {
+ sln = sl->next;
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ sl = sln;
+ }
+
if (!env->explored_states)
return;
- for (i = 0; i < env->prog->len; i++) {
+ for (i = 0; i < state_htab_size(env); i++) {
sl = env->explored_states[i];
- if (sl)
- while (sl != STATE_LIST_MARK) {
- sln = sl->next;
- free_verifier_state(&sl->state, false);
- kfree(sl);
- sl = sln;
- }
+ while (sl) {
+ sln = sl->next;
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ sl = sln;
+ }
}
- kfree(env->explored_states);
+ kvfree(env->explored_states);
+}
+
+static void print_verification_stats(struct bpf_verifier_env *env)
+{
+ int i;
+
+ if (env->log.level & BPF_LOG_STATS) {
+ verbose(env, "verification time %lld usec\n",
+ div_u64(env->verification_time, 1000));
+ verbose(env, "stack depth ");
+ for (i = 0; i < env->subprog_cnt; i++) {
+ u32 depth = env->subprog_info[i].stack_depth;
+
+ verbose(env, "%d", depth);
+ if (i + 1 < env->subprog_cnt)
+ verbose(env, "+");
+ }
+ verbose(env, "\n");
+ }
+ verbose(env, "processed %d insns (limit %d) max_states_per_insn %d "
+ "total_states %d peak_states %d mark_read %d\n",
+ env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS,
+ env->max_states_per_insn, env->total_states,
+ env->peak_states, env->longest_mark_read_walk);
}
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
union bpf_attr __user *uattr)
{
+ u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
- int ret = -EINVAL;
+ int i, len, ret = -EINVAL;
+ bool is_priv;
/* no program is valid */
if (ARRAY_SIZE(bpf_verifier_ops) == 0)
@@ -7161,17 +9222,21 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
return -ENOMEM;
log = &env->log;
+ len = (*prog)->len;
env->insn_aux_data =
- vzalloc(array_size(sizeof(struct bpf_insn_aux_data),
- (*prog)->len));
+ vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
ret = -ENOMEM;
if (!env->insn_aux_data)
goto err_free_env;
+ for (i = 0; i < len; i++)
+ env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
+ is_priv = capable(CAP_SYS_ADMIN);
/* grab the mutex to protect few globals used by verifier */
- mutex_lock(&bpf_verifier_lock);
+ if (!is_priv)
+ mutex_lock(&bpf_verifier_lock);
if (attr->log_level || attr->log_buf || attr->log_size) {
/* user requested verbose verifier output
@@ -7183,8 +9248,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
ret = -EINVAL;
/* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
- !log->level || !log->ubuf)
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
+ !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
goto err_unlock;
}
@@ -7194,6 +9259,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
+ env->allow_ptr_leaks = is_priv;
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
@@ -7204,15 +9271,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
goto skip_full_check;
}
- env->explored_states = kcalloc(env->prog->len,
+ env->explored_states = kvcalloc(state_htab_size(env),
sizeof(struct bpf_verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;
- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
ret = check_subprogs(env);
if (ret < 0)
goto skip_full_check;
@@ -7242,8 +9307,17 @@ skip_full_check:
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
- if (ret == 0)
- sanitize_dead_code(env);
+ if (is_priv) {
+ if (ret == 0)
+ opt_hard_wire_dead_code_branches(env);
+ if (ret == 0)
+ ret = opt_remove_dead_code(env);
+ if (ret == 0)
+ ret = opt_remove_nops(env);
+ } else {
+ if (ret == 0)
+ sanitize_dead_code(env);
+ }
if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
@@ -7252,9 +9326,21 @@ skip_full_check:
if (ret == 0)
ret = fixup_bpf_calls(env);
+ /* do 32-bit optimization after insn patching has done so those patched
+ * insns could be handled correctly.
+ */
+ if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
+ ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
+ env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
+ : false;
+ }
+
if (ret == 0)
ret = fixup_call_args(env);
+ env->verification_time = ktime_get_ns() - start_time;
+ print_verification_stats(env);
+
if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
if (log->level && !log->ubuf) {
@@ -7294,7 +9380,8 @@ err_release_maps:
release_maps(env);
*prog = env->prog;
err_unlock:
- mutex_unlock(&bpf_verifier_lock);
+ if (!is_priv)
+ mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
err_free_env:
kfree(env);
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..9bb96ace9fa1 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -17,8 +17,8 @@ struct xsk_map {
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
- int cpu, err = -EINVAL;
struct xsk_map *m;
+ int cpu, err;
u64 cost;
if (!capable(CAP_NET_ADMIN))
@@ -37,13 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
cost += sizeof(struct list_head) * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_m;
-
- m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
/* Notice returns -EPERM on if map size is larger than memlock limit */
- err = bpf_map_precharge_memlock(m->map.pages);
+ err = bpf_map_charge_init(&m->map.memory, cost);
if (err)
goto free_m;
@@ -51,7 +47,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
m->flush_list = alloc_percpu(struct list_head);
if (!m->flush_list)
- goto free_m;
+ goto free_charge;
for_each_possible_cpu(cpu)
INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
@@ -65,6 +61,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
free_percpu:
free_percpu(m->flush_list);
+free_charge:
+ bpf_map_charge_finish(&m->map.memory);
free_m:
kfree(m);
return ERR_PTR(err);
@@ -147,13 +145,18 @@ void __xsk_map_flush(struct bpf_map *map)
list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
xsk_flush(xs);
- __list_del(xs->flush_node.prev, xs->flush_node.next);
- xs->flush_node.prev = NULL;
+ __list_del_clearprev(&xs->flush_node);
}
}
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return __xsk_map_lookup_elem(map, *(u32 *)key);
+}
+
+static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
+{
return ERR_PTR(-EOPNOTSUPP);
}
@@ -220,6 +223,7 @@ const struct bpf_map_ops xsk_map_ops = {
.map_free = xsk_map_free,
.map_get_next_key = xsk_map_get_next_key,
.map_lookup_elem = xsk_map_lookup_elem,
+ .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
.map_update_elem = xsk_map_update_elem,
.map_delete_elem = xsk_map_delete_elem,
.map_check_btf = map_check_no_btf,
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..1444f3954d75 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -93,9 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
break;
case _LINUX_CAPABILITY_VERSION_2:
warn_deprecated_v2();
- /*
- * fall through - v3 is otherwise equivalent to v2.
- */
+ /* fall through - v3 is otherwise equivalent to v2. */
case _LINUX_CAPABILITY_VERSION_3:
*tocopy = _LINUX_CAPABILITY_U32S_3;
break;
@@ -299,7 +297,7 @@ bool has_ns_capability(struct task_struct *t,
int ret;
rcu_read_lock();
- ret = security_capable(__task_cred(t), ns, cap);
+ ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
rcu_read_unlock();
return (ret == 0);
@@ -340,7 +338,7 @@ bool has_ns_capability_noaudit(struct task_struct *t,
int ret;
rcu_read_lock();
- ret = security_capable_noaudit(__task_cred(t), ns, cap);
+ ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
@@ -363,7 +361,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
-static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+static bool ns_capable_common(struct user_namespace *ns,
+ int cap,
+ unsigned int opts)
{
int capable;
@@ -372,8 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
BUG();
}
- capable = audit ? security_capable(current_cred(), ns, cap) :
- security_capable_noaudit(current_cred(), ns, cap);
+ capable = security_capable(current_cred(), ns, cap, opts);
if (capable == 0) {
current->flags |= PF_SUPERPRIV;
return true;
@@ -394,7 +393,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
- return ns_capable_common(ns, cap, true);
+ return ns_capable_common(ns, cap, CAP_OPT_NONE);
}
EXPORT_SYMBOL(ns_capable);
@@ -412,11 +411,30 @@ EXPORT_SYMBOL(ns_capable);
*/
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
- return ns_capable_common(ns, cap, false);
+ return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT);
}
EXPORT_SYMBOL(ns_capable_noaudit);
/**
+ * ns_capable_setid - Determine if the current task has a superior capability
+ * in effect, while signalling that this check is being done from within a
+ * setid syscall.
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_setid(struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(ns, cap, CAP_OPT_INSETID);
+}
+EXPORT_SYMBOL(ns_capable_setid);
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
@@ -448,10 +466,11 @@ EXPORT_SYMBOL(capable);
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
int cap)
{
+
if (WARN_ON_ONCE(!cap_valid(cap)))
return false;
- if (security_capable(file->f_cred, ns, cap) == 0)
+ if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0)
return true;
return false;
@@ -500,10 +519,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
int ret = 0; /* An absent tracer adds no restrictions */
const struct cred *cred;
+
rcu_read_lock();
cred = rcu_dereference(tsk->ptracer_cred);
if (cred)
- ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE);
+ ret = security_capable(cred, ns, CAP_SYS_PTRACE,
+ CAP_OPT_NOAUDIT);
rcu_read_unlock();
return (ret == 0);
}
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index bfcdae896122..5d7a76bfbbb7 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
+obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o
-obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c950864016e2..809e34a3c017 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -7,6 +7,7 @@
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/refcount.h>
+#include <linux/fs_context.h>
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
@@ -27,16 +28,44 @@ extern void __init enable_debug_cgroup(void);
#define TRACE_CGROUP_PATH(type, cgrp, ...) \
do { \
if (trace_cgroup_##type##_enabled()) { \
- spin_lock(&trace_cgroup_path_lock); \
+ unsigned long flags; \
+ spin_lock_irqsave(&trace_cgroup_path_lock, \
+ flags); \
cgroup_path(cgrp, trace_cgroup_path, \
TRACE_CGROUP_PATH_LEN); \
trace_cgroup_##type(cgrp, trace_cgroup_path, \
##__VA_ARGS__); \
- spin_unlock(&trace_cgroup_path_lock); \
+ spin_unlock_irqrestore(&trace_cgroup_path_lock, \
+ flags); \
} \
} while (0)
/*
+ * The cgroup filesystem superblock creation/mount context.
+ */
+struct cgroup_fs_context {
+ struct kernfs_fs_context kfc;
+ struct cgroup_root *root;
+ struct cgroup_namespace *ns;
+ unsigned int flags; /* CGRP_ROOT_* flags */
+
+ /* cgroup1 bits */
+ bool cpuset_clone_children;
+ bool none; /* User explicitly requested empty subsystem */
+ bool all_ss; /* Seen 'all' option */
+ u16 subsys_mask; /* Selected subsystems */
+ char *name; /* Hierarchy name */
+ char *release_agent; /* Path for release notifications */
+};
+
+static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ return container_of(kfc, struct cgroup_fs_context, kfc);
+}
+
+/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
* direction, a css_set is naturally associated with multiple cgroups.
@@ -117,16 +146,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-struct cgroup_sb_opts {
- u16 subsys_mask;
- unsigned int flags;
- char *release_agent;
- bool cpuset_clone_children;
- char *name;
- /* User explicitly requested empty subsystem */
- bool none;
-};
-
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
@@ -197,12 +216,10 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
void cgroup_free_root(struct cgroup_root *root);
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
+void init_cgroup_root(struct cgroup_fs_context *ctx);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
- struct cgroup_root *root, unsigned long magic,
- struct cgroup_namespace *ns);
+int cgroup_do_get_tree(struct fs_context *fc);
int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp);
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
@@ -226,6 +243,7 @@ int cgroup_rmdir(struct kernfs_node *kn);
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
struct kernfs_root *kf_root);
+int __cgroup_task_count(const struct cgroup *cgrp);
int cgroup_task_count(const struct cgroup *cgrp);
/*
@@ -246,14 +264,15 @@ extern const struct proc_ns_operations cgroupns_operations;
*/
extern struct cftype cgroup1_base_files[];
extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+extern const struct fs_parameter_description cgroup1_fs_parameters;
int proc_cgroupstats_show(struct seq_file *m, void *v);
bool cgroup1_ssid_disabled(int ssid);
void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
void cgroup1_release_agent(struct work_struct *work);
void cgroup1_check_for_release(struct cgroup *cgrp);
-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
- void *data, unsigned long magic,
- struct cgroup_namespace *ns);
+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param);
+int cgroup1_get_tree(struct fs_context *fc);
+int cgroup1_reconfigure(struct fs_context *ctx);
#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 583b969b0c0e..88006be40ea3 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
#include <linux/ctype.h>
@@ -13,9 +14,12 @@
#include <linux/delayacct.h>
#include <linux/pid_namespace.h>
#include <linux/cgroupstats.h>
+#include <linux/fs_parser.h>
#include <trace/events/cgroup.h>
+#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__)
+
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
@@ -339,22 +343,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
return l;
}
-/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- */
-int cgroup_task_count(const struct cgroup *cgrp)
-{
- int count = 0;
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += link->cset->nr_tasks;
- spin_unlock_irq(&css_set_lock);
- return count;
-}
-
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
@@ -906,172 +894,195 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
return 0;
}
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
- char *token, *o = data;
- bool all_ss = false, one_ss = false;
- u16 mask = U16_MAX;
- struct cgroup_subsys *ss;
- int nr_opts = 0;
- int i;
-
-#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
- memset(opts, 0, sizeof(*opts));
+enum cgroup1_param {
+ Opt_all,
+ Opt_clone_children,
+ Opt_cpuset_v2_mode,
+ Opt_name,
+ Opt_none,
+ Opt_noprefix,
+ Opt_release_agent,
+ Opt_xattr,
+};
- while ((token = strsep(&o, ",")) != NULL) {
- nr_opts++;
+static const struct fs_parameter_spec cgroup1_param_specs[] = {
+ fsparam_flag ("all", Opt_all),
+ fsparam_flag ("clone_children", Opt_clone_children),
+ fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode),
+ fsparam_string("name", Opt_name),
+ fsparam_flag ("none", Opt_none),
+ fsparam_flag ("noprefix", Opt_noprefix),
+ fsparam_string("release_agent", Opt_release_agent),
+ fsparam_flag ("xattr", Opt_xattr),
+ {}
+};
- if (!*token)
- return -EINVAL;
- if (!strcmp(token, "none")) {
- /* Explicitly have no subsystems */
- opts->none = true;
- continue;
- }
- if (!strcmp(token, "all")) {
- /* Mutually exclusive option 'all' + subsystem name */
- if (one_ss)
- return -EINVAL;
- all_ss = true;
- continue;
- }
- if (!strcmp(token, "noprefix")) {
- opts->flags |= CGRP_ROOT_NOPREFIX;
- continue;
- }
- if (!strcmp(token, "clone_children")) {
- opts->cpuset_clone_children = true;
- continue;
- }
- if (!strcmp(token, "cpuset_v2_mode")) {
- opts->flags |= CGRP_ROOT_CPUSET_V2_MODE;
- continue;
- }
- if (!strcmp(token, "xattr")) {
- opts->flags |= CGRP_ROOT_XATTR;
- continue;
- }
- if (!strncmp(token, "release_agent=", 14)) {
- /* Specifying two release agents is forbidden */
- if (opts->release_agent)
- return -EINVAL;
- opts->release_agent =
- kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
- if (!opts->release_agent)
- return -ENOMEM;
- continue;
- }
- if (!strncmp(token, "name=", 5)) {
- const char *name = token + 5;
-
- /* blocked by boot param? */
- if (cgroup_no_v1_named)
- return -ENOENT;
- /* Can't specify an empty name */
- if (!strlen(name))
- return -EINVAL;
- /* Must match [\w.-]+ */
- for (i = 0; i < strlen(name); i++) {
- char c = name[i];
- if (isalnum(c))
- continue;
- if ((c == '.') || (c == '-') || (c == '_'))
- continue;
- return -EINVAL;
- }
- /* Specifying two names is forbidden */
- if (opts->name)
- return -EINVAL;
- opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN - 1,
- GFP_KERNEL);
- if (!opts->name)
- return -ENOMEM;
+const struct fs_parameter_description cgroup1_fs_parameters = {
+ .name = "cgroup1",
+ .specs = cgroup1_param_specs,
+};
- continue;
+int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct cgroup_subsys *ss;
+ struct fs_parse_result result;
+ int opt, i;
+
+ opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result);
+ if (opt == -ENOPARAM) {
+ if (strcmp(param->key, "source") == 0) {
+ fc->source = param->string;
+ param->string = NULL;
+ return 0;
}
-
for_each_subsys(ss, i) {
- if (strcmp(token, ss->legacy_name))
+ if (strcmp(param->key, ss->legacy_name))
continue;
- if (!cgroup_ssid_enabled(i))
+ ctx->subsys_mask |= (1 << i);
+ return 0;
+ }
+ return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key);
+ }
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_none:
+ /* Explicitly have no subsystems */
+ ctx->none = true;
+ break;
+ case Opt_all:
+ ctx->all_ss = true;
+ break;
+ case Opt_noprefix:
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
+ break;
+ case Opt_clone_children:
+ ctx->cpuset_clone_children = true;
+ break;
+ case Opt_cpuset_v2_mode:
+ ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE;
+ break;
+ case Opt_xattr:
+ ctx->flags |= CGRP_ROOT_XATTR;
+ break;
+ case Opt_release_agent:
+ /* Specifying two release agents is forbidden */
+ if (ctx->release_agent)
+ return cg_invalf(fc, "cgroup1: release_agent respecified");
+ ctx->release_agent = param->string;
+ param->string = NULL;
+ break;
+ case Opt_name:
+ /* blocked by boot param? */
+ if (cgroup_no_v1_named)
+ return -ENOENT;
+ /* Can't specify an empty name */
+ if (!param->size)
+ return cg_invalf(fc, "cgroup1: Empty name");
+ if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1)
+ return cg_invalf(fc, "cgroup1: Name too long");
+ /* Must match [\w.-]+ */
+ for (i = 0; i < param->size; i++) {
+ char c = param->string[i];
+ if (isalnum(c))
continue;
- if (cgroup1_ssid_disabled(i))
+ if ((c == '.') || (c == '-') || (c == '_'))
continue;
-
- /* Mutually exclusive option 'all' + subsystem name */
- if (all_ss)
- return -EINVAL;
- opts->subsys_mask |= (1 << i);
- one_ss = true;
-
- break;
+ return cg_invalf(fc, "cgroup1: Invalid name");
}
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
+ /* Specifying two names is forbidden */
+ if (ctx->name)
+ return cg_invalf(fc, "cgroup1: name respecified");
+ ctx->name = param->string;
+ param->string = NULL;
+ break;
}
+ return 0;
+}
+
+static int check_cgroupfs_options(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ u16 mask = U16_MAX;
+ u16 enabled = 0;
+ struct cgroup_subsys *ss;
+ int i;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+ for_each_subsys(ss, i)
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+ enabled |= 1 << i;
+
+ ctx->subsys_mask &= enabled;
/*
- * If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options were
- * not specified, let's default to 'all'
+ * In absense of 'none', 'name=' or subsystem name options,
+ * let's default to 'all'.
*/
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
- opts->subsys_mask |= (1 << i);
+ if (!ctx->subsys_mask && !ctx->none && !ctx->name)
+ ctx->all_ss = true;
+
+ if (ctx->all_ss) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (ctx->subsys_mask)
+ return cg_invalf(fc, "cgroup1: subsys name conflicts with all");
+ /* 'all' => select all the subsystems */
+ ctx->subsys_mask = enabled;
+ }
/*
* We either have to specify by name or by subsystems. (So all
* empty hierarchies must have a name).
*/
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
+ if (!ctx->subsys_mask && !ctx->name)
+ return cg_invalf(fc, "cgroup1: Need name or subsystem set");
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
- if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
- return -EINVAL;
+ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask))
+ return cg_invalf(fc, "cgroup1: noprefix used incorrectly");
/* Can't specify "none" and some subsystems */
- if (opts->subsys_mask && opts->none)
- return -EINVAL;
+ if (ctx->subsys_mask && ctx->none)
+ return cg_invalf(fc, "cgroup1: none used incorrectly");
return 0;
}
-static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
+int cgroup1_reconfigure(struct fs_context *fc)
{
- int ret = 0;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_sb_opts opts;
+ int ret = 0;
u16 added_mask, removed_mask;
cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* See what subsystems are wanted */
- ret = parse_cgroupfs_options(data, &opts);
+ ret = check_cgroupfs_options(fc);
if (ret)
goto out_unlock;
- if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent)
pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
- added_mask = opts.subsys_mask & ~root->subsys_mask;
- removed_mask = root->subsys_mask & ~opts.subsys_mask;
+ added_mask = ctx->subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~ctx->subsys_mask;
/* Don't allow flags or name to change at remount */
- if ((opts.flags ^ root->flags) ||
- (opts.name && strcmp(opts.name, root->name))) {
- pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags, opts.name ?: "", root->flags, root->name);
+ if ((ctx->flags ^ root->flags) ||
+ (ctx->name && strcmp(ctx->name, root->name))) {
+ cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"",
+ ctx->flags, ctx->name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
@@ -1088,17 +1099,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
- if (opts.release_agent) {
+ if (ctx->release_agent) {
spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, opts.release_agent);
+ strcpy(root->release_agent_path, ctx->release_agent);
spin_unlock(&release_agent_path_lock);
}
trace_cgroup_remount(root);
out_unlock:
- kfree(opts.release_agent);
- kfree(opts.name);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -1106,30 +1115,30 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
.rename = cgroup1_rename,
.show_options = cgroup1_show_options,
- .remount_fs = cgroup1_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
};
-struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
- void *data, unsigned long magic,
- struct cgroup_namespace *ns)
+/*
+ * The guts of cgroup1 mount - find or create cgroup_root to use.
+ * Called with cgroup_mutex held; returns 0 on success, -E... on
+ * error and positive - in case when the candidate is busy dying.
+ * On success it stashes a reference to cgroup_root into given
+ * cgroup_fs_context; that reference is *NOT* counting towards the
+ * cgroup_root refcount.
+ */
+static int cgroup1_root_to_use(struct fs_context *fc)
{
- struct super_block *pinned_sb = NULL;
- struct cgroup_sb_opts opts;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
struct cgroup_root *root;
struct cgroup_subsys *ss;
- struct dentry *dentry;
int i, ret;
- bool new_root = false;
-
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* First find the desired set of subsystems */
- ret = parse_cgroupfs_options(data, &opts);
+ ret = check_cgroupfs_options(fc);
if (ret)
- goto out_unlock;
+ return ret;
/*
* Destruction of cgroup root is asynchronous, so subsystems may
@@ -1139,16 +1148,12 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* starting. Testing ref liveliness is good enough.
*/
for_each_subsys(ss, i) {
- if (!(opts.subsys_mask & (1 << i)) ||
+ if (!(ctx->subsys_mask & (1 << i)) ||
ss->root == &cgrp_dfl_root)
continue;
- if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt))
+ return 1; /* restart */
cgroup_put(&ss->root->cgrp);
}
@@ -1163,8 +1168,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* name matches but sybsys_mask doesn't, we should fail.
* Remember whether name matched.
*/
- if (opts.name) {
- if (strcmp(opts.name, root->name))
+ if (ctx->name) {
+ if (strcmp(ctx->name, root->name))
continue;
name_match = true;
}
@@ -1173,42 +1178,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* If we asked for subsystems (or explicitly for no
* subsystems) then they must match.
*/
- if ((opts.subsys_mask || opts.none) &&
- (opts.subsys_mask != root->subsys_mask)) {
+ if ((ctx->subsys_mask || ctx->none) &&
+ (ctx->subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
- ret = -EBUSY;
- goto out_unlock;
+ return -EBUSY;
}
- if (root->flags ^ opts.flags)
+ if (root->flags ^ ctx->flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- /*
- * We want to reuse @root whose lifetime is governed by its
- * ->cgrp. Let's check whether @root is alive and keep it
- * that way. As cgroup_kill_sb() can happen anytime, we
- * want to block it by pinning the sb so that @root doesn't
- * get killed before mount is complete.
- *
- * With the sb pinned, tryget_live can reliably indicate
- * whether @root can be reused. If it's being killed,
- * drain it. We can use wait_queue for the wait but this
- * path is super cold. Let's just sleep a bit and retry.
- */
- pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
- if (IS_ERR(pinned_sb) ||
- !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- if (!IS_ERR_OR_NULL(pinned_sb))
- deactivate_super(pinned_sb);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
-
- ret = 0;
- goto out_unlock;
+ ctx->root = root;
+ return 0;
}
/*
@@ -1216,62 +1197,58 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
* specification is allowed for already existing hierarchies but we
* can't create new one without subsys specification.
*/
- if (!opts.subsys_mask && !opts.none) {
- ret = -EINVAL;
- goto out_unlock;
- }
+ if (!ctx->subsys_mask && !ctx->none)
+ return cg_invalf(fc, "cgroup1: No subsys list or none specified");
/* Hierarchies may only be created in the initial cgroup namespace. */
- if (ns != &init_cgroup_ns) {
- ret = -EPERM;
- goto out_unlock;
- }
+ if (ctx->ns != &init_cgroup_ns)
+ return -EPERM;
root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- ret = -ENOMEM;
- goto out_unlock;
- }
- new_root = true;
+ if (!root)
+ return -ENOMEM;
- init_cgroup_root(root, &opts);
+ ctx->root = root;
+ init_cgroup_root(ctx);
- ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
+ ret = cgroup_setup_root(root, ctx->subsys_mask);
if (ret)
cgroup_free_root(root);
+ return ret;
+}
-out_unlock:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(opts.release_agent);
- kfree(opts.name);
+int cgroup1_get_tree(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
- if (ret)
- return ERR_PTR(ret);
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
- dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
- CGROUP_SUPER_MAGIC, ns);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
- /*
- * There's a race window after we release cgroup_mutex and before
- * allocating a superblock. Make sure a concurrent process won't
- * be able to re-use the root during this window by delaying the
- * initialization of root refcnt.
- */
- if (new_root) {
- mutex_lock(&cgroup_mutex);
- percpu_ref_reinit(&root->cgrp.self.refcnt);
- mutex_unlock(&cgroup_mutex);
- }
+ ret = cgroup1_root_to_use(fc);
+ if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
+ ret = 1; /* restart */
- /*
- * If @pinned_sb, we're reusing an existing root and holding an
- * extra ref on its sb. Mount is complete. Put the extra ref.
- */
- if (pinned_sb)
- deactivate_super(pinned_sb);
+ mutex_unlock(&cgroup_mutex);
- return dentry;
+ if (!ret)
+ ret = cgroup_do_get_tree(fc);
+
+ if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
+ struct super_block *sb = fc->root->d_sb;
+ dput(fc->root);
+ deactivate_locked_super(sb);
+ ret = 1;
+ }
+
+ if (unlikely(ret > 0)) {
+ msleep(10);
+ return restart_syscall();
+ }
+ return ret;
}
static int __init cgroup1_wq_init(void)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index f31bd61c9466..300b0c416341 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -54,6 +54,7 @@
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/file.h>
+#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
#include <linux/psi.h>
#include <net/sock.h>
@@ -100,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
@@ -197,7 +198,7 @@ static u64 css_serial_nr_next = 1;
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
-static u16 have_free_callback __read_mostly;
+static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
@@ -214,7 +215,8 @@ static struct cftype cgroup_base_files[];
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
-static void css_task_iter_advance(struct css_task_iter *it);
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
@@ -592,6 +594,39 @@ static void cgroup_get_live(struct cgroup *cgrp)
css_get(&cgrp->self);
}
+/**
+ * __cgroup_task_count - count the number of tasks in a cgroup. The caller
+ * is responsible for taking the css_set_lock.
+ * @cgrp: the cgroup in question
+ */
+int __cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count = 0;
+ struct cgrp_cset_link *link;
+
+ lockdep_assert_held(&css_set_lock);
+
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += link->cset->nr_tasks;
+
+ return count;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+int cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count;
+
+ spin_lock_irq(&css_set_lock);
+ count = __cgroup_task_count(cgrp);
+ spin_unlock_irq(&css_set_lock);
+
+ return count;
+}
+
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -704,6 +739,7 @@ struct css_set init_css_set = {
.dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -782,6 +818,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
break;
cgroup1_check_for_release(cgrp);
+ TRACE_CGROUP_PATH(notify_populated, cgrp,
+ cgroup_is_populated(cgrp));
cgroup_file_notify(&cgrp->events_file);
child = cgrp;
@@ -807,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
cgroup_update_populated(link->cgrp, populated);
}
+/*
+ * @task is leaving, advance task iterators which are pointing to it so
+ * that they can resume at the next position. Advancing an iterator might
+ * remove it from the list, use safe walk. See css_task_iter_skip() for
+ * details.
+ */
+static void css_set_skip_task_iters(struct css_set *cset,
+ struct task_struct *task)
+{
+ struct css_task_iter *it, *pos;
+
+ list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
+ css_task_iter_skip(it, task);
+}
+
/**
* css_set_move_task - move a task from one css_set to another
* @task: task being moved
@@ -832,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
css_set_update_populated(to_cset, true);
if (from_cset) {
- struct css_task_iter *it, *pos;
-
WARN_ON_ONCE(list_empty(&task->cg_list));
- /*
- * @task is leaving, advance task iterators which are
- * pointing to it so that they can resume at the next
- * position. Advancing an iterator might remove it from
- * the list, use safe walk. See css_task_iter_advance*()
- * for details.
- */
- list_for_each_entry_safe(it, pos, &from_cset->task_iters,
- iters_node)
- if (it->task_pos == &task->cg_list)
- css_task_iter_advance(it);
-
+ css_set_skip_task_iters(from_cset, task);
list_del_init(&task->cg_list);
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
@@ -1174,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->dying_tasks);
INIT_LIST_HEAD(&cset->task_iters);
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
@@ -1772,26 +1813,42 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
-static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
-{
- char *token;
+enum cgroup2_param {
+ Opt_nsdelegate,
+ Opt_memory_localevents,
+ nr__cgroup2_params
+};
- *root_flags = 0;
+static const struct fs_parameter_spec cgroup2_param_specs[] = {
+ fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("memory_localevents", Opt_memory_localevents),
+ {}
+};
- if (!data || *data == '\0')
- return 0;
+static const struct fs_parameter_description cgroup2_fs_parameters = {
+ .name = "cgroup2",
+ .specs = cgroup2_param_specs,
+};
- while ((token = strsep(&data, ",")) != NULL) {
- if (!strcmp(token, "nsdelegate")) {
- *root_flags |= CGRP_ROOT_NS_DELEGATE;
- continue;
- }
+static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
- pr_err("cgroup2: unknown option \"%s\"\n", token);
- return -EINVAL;
- }
+ opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- return 0;
+ switch (opt) {
+ case Opt_nsdelegate:
+ ctx->flags |= CGRP_ROOT_NS_DELEGATE;
+ return 0;
+ case Opt_memory_localevents:
+ ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ return 0;
+ }
+ return -EINVAL;
}
static void apply_cgroup_root_flags(unsigned int root_flags)
@@ -1801,6 +1858,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+
+ if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
}
}
@@ -1808,19 +1870,16 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ seq_puts(seq, ",memory_localevents");
return 0;
}
-static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
+static int cgroup_reconfigure(struct fs_context *fc)
{
- unsigned int root_flags;
- int ret;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
- ret = parse_cgroup_root_flags(data, &root_flags);
- if (ret)
- return ret;
-
- apply_cgroup_root_flags(root_flags);
+ apply_cgroup_root_flags(ctx->flags);
return 0;
}
@@ -1908,8 +1967,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
-void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_fs_context *ctx)
{
+ struct cgroup_root *root = ctx->root;
struct cgroup *cgrp = &root->cgrp;
INIT_LIST_HEAD(&root->root_list);
@@ -1918,16 +1978,16 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
init_cgroup_housekeeping(cgrp);
idr_init(&root->cgroup_idr);
- root->flags = opts->flags;
- if (opts->release_agent)
- strscpy(root->release_agent_path, opts->release_agent, PATH_MAX);
- if (opts->name)
- strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
- if (opts->cpuset_clone_children)
+ root->flags = ctx->flags;
+ if (ctx->release_agent)
+ strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
+ if (ctx->name)
+ strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
+ if (ctx->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1944,7 +2004,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
root_cgrp->ancestor_ids[0] = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
- ref_flags, GFP_KERNEL);
+ 0, GFP_KERNEL);
if (ret)
goto out;
@@ -2028,57 +2088,104 @@ out:
return ret;
}
-struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
- struct cgroup_root *root, unsigned long magic,
- struct cgroup_namespace *ns)
+int cgroup_do_get_tree(struct fs_context *fc)
{
- struct dentry *dentry;
- bool new_sb;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
+ int ret;
- dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
+ ctx->kfc.root = ctx->root->kf_root;
+ if (fc->fs_type == &cgroup2_fs_type)
+ ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
+ else
+ ctx->kfc.magic = CGROUP_SUPER_MAGIC;
+ ret = kernfs_get_tree(fc);
/*
* In non-init cgroup namespace, instead of root cgroup's dentry,
* we return the dentry corresponding to the cgroupns->root_cgrp.
*/
- if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+ if (!ret && ctx->ns != &init_cgroup_ns) {
struct dentry *nsdentry;
+ struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- cgrp = cset_cgroup_from_root(ns->root_cset, root);
+ cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
- nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
- dput(dentry);
- dentry = nsdentry;
+ nsdentry = kernfs_node_dentry(cgrp->kn, sb);
+ dput(fc->root);
+ fc->root = nsdentry;
+ if (IS_ERR(nsdentry)) {
+ ret = PTR_ERR(nsdentry);
+ deactivate_locked_super(sb);
+ }
}
- if (IS_ERR(dentry) || !new_sb)
- cgroup_put(&root->cgrp);
+ if (!ctx->kfc.new_sb_created)
+ cgroup_put(&ctx->root->cgrp);
+
+ return ret;
+}
+
+/*
+ * Destroy a cgroup filesystem context.
+ */
+static void cgroup_fs_context_free(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
- return dentry;
+ kfree(ctx->name);
+ kfree(ctx->release_agent);
+ put_cgroup_ns(ctx->ns);
+ kernfs_free_fs_context(fc);
+ kfree(ctx);
}
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+static int cgroup_get_tree(struct fs_context *fc)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct dentry *dentry;
+ struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- get_cgroup_ns(ns);
+ cgrp_dfl_visible = true;
+ cgroup_get_live(&cgrp_dfl_root.cgrp);
+ ctx->root = &cgrp_dfl_root;
- /* Check if the caller has permission to mount. */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
- put_cgroup_ns(ns);
- return ERR_PTR(-EPERM);
- }
+ ret = cgroup_do_get_tree(fc);
+ if (!ret)
+ apply_cgroup_root_flags(ctx->flags);
+ return ret;
+}
+
+static const struct fs_context_operations cgroup_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup2_parse_param,
+ .get_tree = cgroup_get_tree,
+ .reconfigure = cgroup_reconfigure,
+};
+
+static const struct fs_context_operations cgroup1_fs_context_ops = {
+ .free = cgroup_fs_context_free,
+ .parse_param = cgroup1_parse_param,
+ .get_tree = cgroup1_get_tree,
+ .reconfigure = cgroup1_reconfigure,
+};
+
+/*
+ * Initialise the cgroup filesystem creation/reconfiguration context. Notably,
+ * we select the namespace we're going to use.
+ */
+static int cgroup_init_fs_context(struct fs_context *fc)
+{
+ struct cgroup_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
/*
* The first time anyone tries to mount a cgroup, enable the list
@@ -2087,29 +2194,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
- if (fs_type == &cgroup2_fs_type) {
- unsigned int root_flags;
-
- ret = parse_cgroup_root_flags(data, &root_flags);
- if (ret) {
- put_cgroup_ns(ns);
- return ERR_PTR(ret);
- }
-
- cgrp_dfl_visible = true;
- cgroup_get_live(&cgrp_dfl_root.cgrp);
-
- dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
- CGROUP2_SUPER_MAGIC, ns);
- if (!IS_ERR(dentry))
- apply_cgroup_root_flags(root_flags);
- } else {
- dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
- CGROUP_SUPER_MAGIC, ns);
- }
-
- put_cgroup_ns(ns);
- return dentry;
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ fc->fs_private = &ctx->kfc;
+ if (fc->fs_type == &cgroup2_fs_type)
+ fc->ops = &cgroup_fs_context_ops;
+ else
+ fc->ops = &cgroup1_fs_context_ops;
+ if (fc->user_ns)
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->ns->user_ns);
+ fc->global = true;
+ return 0;
}
static void cgroup_kill_sb(struct super_block *sb)
@@ -2118,33 +2214,33 @@ static void cgroup_kill_sb(struct super_block *sb)
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
/*
- * If @root doesn't have any mounts or children, start killing it.
+ * If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
* cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
- if (!list_empty(&root->cgrp.self.children) ||
- root == &cgrp_dfl_root)
- cgroup_put(&root->cgrp);
- else
+ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt))
percpu_ref_kill(&root->cgrp.self.refcnt);
-
+ cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
struct file_system_type cgroup_fs_type = {
- .name = "cgroup",
- .mount = cgroup_mount,
- .kill_sb = cgroup_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "cgroup",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = &cgroup1_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type cgroup2_fs_type = {
- .name = "cgroup2",
- .mount = cgroup_mount,
- .kill_sb = cgroup_kill_sb,
- .fs_flags = FS_USERNS_MOUNT,
+ .name = "cgroup2",
+ .init_fs_context = cgroup_init_fs_context,
+ .parameters = &cgroup2_fs_parameters,
+ .kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
@@ -2358,8 +2454,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
get_css_set(to_cset);
to_cset->nr_tasks++;
css_set_move_task(task, from_cset, to_cset, true);
- put_css_set_locked(from_cset);
from_cset->nr_tasks--;
+ /*
+ * If the source or destination cgroup is frozen,
+ * the task might require to change its state.
+ */
+ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
+ to_cset->dfl_cgrp);
+ put_css_set_locked(from_cset);
+
}
}
spin_unlock_irq(&css_set_lock);
@@ -2558,7 +2661,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
- goto err;
+ return -ENOMEM;
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
@@ -2590,9 +2693,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
}
return 0;
-err:
- cgroup_migrate_finish(mgctx);
- return -ENOMEM;
}
/**
@@ -3403,8 +3503,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
static int cgroup_events_show(struct seq_file *seq, void *v)
{
- seq_printf(seq, "populated %d\n",
- cgroup_is_populated(seq_css(seq)->cgroup));
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
+ seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
+
return 0;
}
@@ -3454,17 +3557,118 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+ return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+ return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+ return psi_show(seq, psi, PSI_CPU);
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
+{
+ struct psi_trigger *new;
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_get(cgrp);
+ cgroup_kn_unlock(of->kn);
+
+ new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(new);
+ }
+
+ psi_trigger_replace(&of->priv, new);
+
+ cgroup_put(cgrp);
+
+ return nbytes;
+}
+
+static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+}
+
+static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+}
+
+static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+ poll_table *pt)
+{
+ return psi_trigger_poll(&of->priv, of->file, pt);
+}
+
+static void cgroup_pressure_release(struct kernfs_open_file *of)
+{
+ psi_trigger_replace(&of->priv, NULL);
+}
+#endif /* CONFIG_PSI */
+
+static int cgroup_freeze_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "%d\n", cgrp->freezer.freeze);
+
+ return 0;
+}
+
+static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int freeze;
+
+ ret = kstrtoint(strstrip(buf), 0, &freeze);
+ if (ret)
+ return ret;
+
+ if (freeze < 0 || freeze > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgroup_freeze(cgrp, freeze);
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
-#endif
static int cgroup_file_open(struct kernfs_open_file *of)
{
@@ -3533,6 +3737,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
return ret ?: nbytes;
}
+static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->poll)
+ return cft->poll(of, pt);
+
+ return kernfs_generic_poll(of, pt);
+}
+
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
return seq_cft(seq)->seq_start(seq, ppos);
@@ -3571,6 +3785,7 @@ static struct kernfs_ops cgroup_kf_single_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_show = cgroup_seqfile_show,
};
@@ -3579,6 +3794,7 @@ static struct kernfs_ops cgroup_kf_ops = {
.open = cgroup_file_open,
.release = cgroup_file_release,
.write = cgroup_file_write,
+ .poll = cgroup_file_poll,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
@@ -4010,6 +4226,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
return NULL;
}
+EXPORT_SYMBOL_GPL(css_next_descendant_pre);
/**
* css_rightmost_descendant - return the rightmost descendant of a css
@@ -4197,15 +4414,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
it->task_pos = NULL;
return;
}
- } while (!css_set_populated(cset));
+ } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
- else
+ else if (!list_empty(&cset->mg_tasks))
it->task_pos = cset->mg_tasks.next;
+ else
+ it->task_pos = cset->dying_tasks.next;
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
+ it->dying_tasks_head = &cset->dying_tasks;
/*
* We don't keep css_sets locked across iteration steps and thus
@@ -4231,9 +4451,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
list_add(&it->iters_node, &cset->task_iters);
}
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ if (it->task_pos == &task->cg_list) {
+ it->task_pos = it->task_pos->next;
+ it->flags |= CSS_TASK_ITER_SKIPPED;
+ }
+}
+
static void css_task_iter_advance(struct css_task_iter *it)
{
- struct list_head *next;
+ struct task_struct *task;
lockdep_assert_held(&css_set_lock);
repeat:
@@ -4243,25 +4474,40 @@ repeat:
* consumed first and then ->mg_tasks. After ->mg_tasks,
* we move onto the next cset.
*/
- next = it->task_pos->next;
-
- if (next == it->tasks_head)
- next = it->mg_tasks_head->next;
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ it->flags &= ~CSS_TASK_ITER_SKIPPED;
+ else
+ it->task_pos = it->task_pos->next;
- if (next == it->mg_tasks_head)
+ if (it->task_pos == it->tasks_head)
+ it->task_pos = it->mg_tasks_head->next;
+ if (it->task_pos == it->mg_tasks_head)
+ it->task_pos = it->dying_tasks_head->next;
+ if (it->task_pos == it->dying_tasks_head)
css_task_iter_advance_css_set(it);
- else
- it->task_pos = next;
} else {
/* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
}
- /* if PROCS, skip over tasks which aren't group leaders */
- if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
- !thread_group_leader(list_entry(it->task_pos, struct task_struct,
- cg_list)))
- goto repeat;
+ if (!it->task_pos)
+ return;
+
+ task = list_entry(it->task_pos, struct task_struct, cg_list);
+
+ if (it->flags & CSS_TASK_ITER_PROCS) {
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if (!thread_group_leader(task))
+ goto repeat;
+
+ /* and dying leaders w/o live member threads */
+ if (!atomic_read(&task->signal->live))
+ goto repeat;
+ } else {
+ /* skip all dying ones */
+ if (task->flags & PF_EXITING)
+ goto repeat;
+ }
}
/**
@@ -4317,6 +4563,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
spin_lock_irq(&css_set_lock);
+ /* @it may be half-advanced by skips, finish advancing */
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ css_task_iter_advance(it);
+
if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct,
cg_list);
@@ -4598,6 +4848,12 @@ static struct cftype cgroup_base_files[] = {
.seq_show = cgroup_stat_show,
},
{
+ .name = "cgroup.freeze",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_freeze_show,
+ .write = cgroup_freeze_write,
+ },
+ {
.name = "cpu.stat",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
@@ -4605,20 +4861,26 @@ static struct cftype cgroup_base_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_io_pressure_show,
+ .write = cgroup_io_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "memory.pressure",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_memory_pressure_show,
+ .write = cgroup_memory_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "cpu.pressure",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_cpu_pressure_show,
+ .write = cgroup_cpu_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
-#endif
+#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -4725,9 +4987,11 @@ static void css_release_work_fn(struct work_struct *work)
if (cgroup_on_dfl(cgrp))
cgroup_rstat_flush(cgrp);
+ spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
tcgrp = cgroup_parent(tcgrp))
tcgrp->nr_dying_descendants--;
+ spin_unlock_irq(&css_set_lock);
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -4742,8 +5006,6 @@ static void css_release_work_fn(struct work_struct *work)
if (cgrp->kn)
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
NULL);
-
- cgroup_bpf_put(cgrp);
}
mutex_unlock(&cgroup_mutex);
@@ -4945,12 +5207,31 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (ret)
goto out_psi_free;
+ /*
+ * New cgroup inherits effective freeze counter, and
+ * if the parent has to be frozen, the child has too.
+ */
+ cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ if (cgrp->freezer.e_freeze)
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+
+ spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
- if (tcgrp != cgrp)
+ if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
+
+ /*
+ * If the new cgroup is frozen, all ancestor cgroups
+ * get a new frozen descendant, but their state can't
+ * change because of this.
+ */
+ if (cgrp->freezer.e_freeze)
+ tcgrp->freezer.nr_frozen_descendants++;
+ }
}
+ spin_unlock_irq(&css_set_lock);
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -5235,13 +5516,23 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
if (parent && cgroup_is_threaded(cgrp))
parent->nr_threaded_children--;
+ spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
tcgrp->nr_descendants--;
tcgrp->nr_dying_descendants++;
+ /*
+ * If the dying cgroup is frozen, decrease frozen descendants
+ * counters of ancestor cgroups.
+ */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags))
+ tcgrp->freezer.nr_frozen_descendants--;
}
+ spin_unlock_irq(&css_set_lock);
cgroup1_check_for_release(parent);
+ cgroup_bpf_offline(cgrp);
+
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -5267,7 +5558,6 @@ int cgroup_rmdir(struct kernfs_node *kn)
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.show_options = cgroup_show_options,
- .remount_fs = cgroup_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.show_path = cgroup_show_path,
@@ -5313,7 +5603,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
- have_free_callback |= (bool)ss->free << ss->id;
+ have_release_callback |= (bool)ss->release << ss->id;
have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
@@ -5334,11 +5624,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
*/
int __init cgroup_init_early(void)
{
- static struct cgroup_sb_opts __initdata opts;
+ static struct cgroup_fs_context __initdata ctx;
struct cgroup_subsys *ss;
int i;
- init_cgroup_root(&cgrp_dfl_root, &opts);
+ ctx.root = &cgrp_dfl_root;
+ init_cgroup_root(&ctx);
cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
@@ -5376,7 +5667,6 @@ int __init cgroup_init(void)
int ssid;
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
- BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
@@ -5399,7 +5689,7 @@ int __init cgroup_init(void)
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
- BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
+ BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_mutex);
@@ -5690,6 +5980,26 @@ void cgroup_post_fork(struct task_struct *child)
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
}
+
+ /*
+ * If the cgroup has to be frozen, the new task has too.
+ * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
+ * the task into the frozen state.
+ */
+ if (unlikely(cgroup_task_freeze(child))) {
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
+
+ /*
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later
+ * from do_freezer_trap(). So we avoid cgroup's
+ * transient switch from the frozen state and back.
+ */
+ }
+
spin_unlock_irq(&css_set_lock);
}
@@ -5737,7 +6047,13 @@ void cgroup_exit(struct task_struct *tsk)
if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+
+ WARN_ON_ONCE(cgroup_task_frozen(tsk));
+ if (unlikely(cgroup_task_freeze(tsk)))
+ cgroup_update_frozen(task_dfl_cgroup(tsk));
+
spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
@@ -5749,16 +6065,26 @@ void cgroup_exit(struct task_struct *tsk)
} while_each_subsys_mask();
}
-void cgroup_free(struct task_struct *task)
+void cgroup_release(struct task_struct *task)
{
- struct css_set *cset = task_css_set(task);
struct cgroup_subsys *ss;
int ssid;
- do_each_subsys_mask(ss, ssid, have_free_callback) {
- ss->free(task);
+ do_each_subsys_mask(ss, ssid, have_release_callback) {
+ ss->release(task);
} while_each_subsys_mask();
+ if (use_task_css_set_links) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
+}
+
+void cgroup_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
put_css_set(cset);
}
@@ -5915,6 +6241,48 @@ struct cgroup *cgroup_get_from_fd(int fd)
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+static u64 power_of_ten(int power)
+{
+ u64 v = 1;
+ while (power--)
+ v *= 10;
+ return v;
+}
+
+/**
+ * cgroup_parse_float - parse a floating number
+ * @input: input string
+ * @dec_shift: number of decimal digits to shift
+ * @v: output
+ *
+ * Parse a decimal floating point number in @input and store the result in
+ * @v with decimal point right shifted @dec_shift times. For example, if
+ * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
+ * Returns 0 on success, -errno otherwise.
+ *
+ * There's nothing cgroup specific about this function except that it's
+ * currently the only user.
+ */
+int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
+{
+ s64 whole, frac = 0;
+ int fstart = 0, fend = 0, flen;
+
+ if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
+ return -EINVAL;
+ if (frac < 0)
+ return -EINVAL;
+
+ flen = fend > fstart ? fend - fstart : 0;
+ if (flen < dec_shift)
+ frac *= power_of_ten(dec_shift - flen);
+ else
+ frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
+
+ *v = whole * power_of_ten(dec_shift) + frac;
+ return 0;
+}
+
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
@@ -5953,6 +6321,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
* Don't use cgroup_get_live().
*/
cgroup_get(sock_cgroup_ptr(skcd));
+ cgroup_bpf_get(sock_cgroup_ptr(skcd));
return;
}
@@ -5964,6 +6333,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
skcd->val = (unsigned long)cset->dfl_cgrp;
+ cgroup_bpf_get(cset->dfl_cgrp);
break;
}
cpu_relax();
@@ -5974,7 +6344,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
- cgroup_put(sock_cgroup_ptr(skcd));
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ cgroup_bpf_put(cgrp);
+ cgroup_put(cgrp);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
@@ -5996,7 +6369,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ ret = __cgroup_bpf_detach(cgrp, prog, type);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -6057,7 +6430,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
@@ -6077,4 +6450,5 @@ static int __init cgroup_sysfs_init(void)
return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);
+
#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 479743db6c37..b3b02b9c4405 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -39,6 +39,7 @@
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/mount.h>
+#include <linux/fs_context.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
@@ -203,19 +204,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
return css_cs(cs->css.parent);
}
-#ifdef CONFIG_NUMA
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return task->mempolicy;
-}
-#else
-static inline bool task_has_mempolicy(struct task_struct *task)
-{
- return false;
-}
-#endif
-
-
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
@@ -372,25 +360,52 @@ static inline bool is_in_v2_mode(void)
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
-static struct dentry *cpuset_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name, void *data)
-{
- struct file_system_type *cgroup_fs = get_fs_type("cgroup");
- struct dentry *ret = ERR_PTR(-ENODEV);
- if (cgroup_fs) {
- char mountopts[] =
- "cpuset,noprefix,"
- "release_agent=/sbin/cpuset_release_agent";
- ret = cgroup_fs->mount(cgroup_fs, flags,
- unused_dev_name, mountopts);
- put_filesystem(cgroup_fs);
+static int cpuset_get_tree(struct fs_context *fc)
+{
+ struct file_system_type *cgroup_fs;
+ struct fs_context *new_fc;
+ int ret;
+
+ cgroup_fs = get_fs_type("cgroup");
+ if (!cgroup_fs)
+ return -ENODEV;
+
+ new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags);
+ if (IS_ERR(new_fc)) {
+ ret = PTR_ERR(new_fc);
+ } else {
+ static const char agent_path[] = "/sbin/cpuset_release_agent";
+ ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0);
+ if (!ret)
+ ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0);
+ if (!ret)
+ ret = vfs_parse_fs_string(new_fc, "release_agent",
+ agent_path, sizeof(agent_path) - 1);
+ if (!ret)
+ ret = vfs_get_tree(new_fc);
+ if (!ret) { /* steal the result */
+ fc->root = new_fc->root;
+ new_fc->root = NULL;
+ }
+ put_fs_context(new_fc);
}
+ put_filesystem(cgroup_fs);
return ret;
}
+static const struct fs_context_operations cpuset_fs_context_ops = {
+ .get_tree = cpuset_get_tree,
+};
+
+static int cpuset_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &cpuset_fs_context_ops;
+ return 0;
+}
+
static struct file_system_type cpuset_fs_type = {
- .name = "cpuset",
- .mount = cpuset_mount,
+ .name = "cpuset",
+ .init_fs_context = cpuset_init_fs_context,
};
/*
@@ -714,7 +729,7 @@ static inline int nr_cpusets(void)
* load balancing domains (sched domains) as specified by that partial
* partition.
*
- * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
+ * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
* for a background explanation of this.
*
* Does not return errors, on the theory that the callers of this
@@ -725,11 +740,10 @@ static inline int nr_cpusets(void)
* Must be called with cpuset_mutex held.
*
* The three key local variables below are:
- * q - a linked-list queue of cpuset pointers, used to implement a
- * top-down scan of all cpusets. This scan loads a pointer
- * to each cpuset marked is_sched_load_balance into the
- * array 'csa'. For our purposes, rebuilding the schedulers
- * sched domains, we can ignore !is_sched_load_balance cpusets.
+ * cp - cpuset pointer, used (together with pos_css) to perform a
+ * top-down scan of all cpusets. For our purposes, rebuilding
+ * the schedulers sched domains, we can ignore !is_sched_load_
+ * balance cpusets.
* csa - (for CpuSet Array) Array of pointers to all the cpusets
* that need to be load balanced, for convenient iterative
* access by the subsequent code that finds the best partition,
@@ -760,7 +774,7 @@ static inline int nr_cpusets(void)
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
- struct cpuset *cp; /* scans q */
+ struct cpuset *cp; /* top-down scan of cpusets */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
@@ -2815,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
if (task_css_is_root(task, cpuset_cgrp_id))
return;
- set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
task->mems_allowed = current->mems_allowed;
}
@@ -3240,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
spin_unlock_irqrestore(&callback_lock, flags);
}
+/**
+ * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
+ * @tsk: pointer to task_struct with which the scheduler is struggling
+ *
+ * Description: In the case that the scheduler cannot find an allowed cpu in
+ * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
+ * mode however, this value is the same as task_cs(tsk)->effective_cpus,
+ * which will not contain a sane cpumask during cases such as cpu hotplugging.
+ * This is the absolute last resort for the scheduler and it is only used if
+ * _every_ other avenue has been traveled.
+ **/
+
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
rcu_read_lock();
- do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
+ do_set_cpus_allowed(tsk, is_in_v2_mode() ?
+ task_cs(tsk)->cpus_allowed : cpu_possible_mask);
rcu_read_unlock();
/*
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 5f1b87330bee..80aa3f027ac3 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v)
css = cset->subsys[ss->id];
if (!css)
continue;
- seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
- (unsigned long)css, css->id);
+ seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
+ css, css->id);
}
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
@@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
if (css->parent)
snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
css->parent->id);
- seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
- (unsigned long)css, css->id,
+ seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name,
+ css, css->id,
atomic_read(&css->online_cnt), pbuf);
}
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 08236798d173..8cf010680678 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,481 +1,314 @@
-/*
- * cgroup_freezer.c - control group freezer subsystem
- *
- * Copyright IBM Corporation, 2007
- *
- * Author : Cedric Le Goater <clg@fr.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- */
-
-#include <linux/export.h>
-#include <linux/slab.h>
+//SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/uaccess.h>
-#include <linux/freezer.h>
-#include <linux/seq_file.h>
-#include <linux/mutex.h>
-
-/*
- * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
- * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
- * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
- * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
- * its ancestors has FREEZING_SELF set.
- */
-enum freezer_state_flags {
- CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
- CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
- CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
- CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
- /* mask for all FREEZING flags */
- CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
-};
+#include "cgroup-internal.h"
-struct freezer {
- struct cgroup_subsys_state css;
- unsigned int state;
-};
+#include <trace/events/cgroup.h>
-static DEFINE_MUTEX(freezer_mutex);
-
-static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
+/*
+ * Propagate the cgroup frozen state upwards by the cgroup tree.
+ */
+static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
{
- return css ? container_of(css, struct freezer, css) : NULL;
-}
+ int desc = 1;
-static inline struct freezer *task_freezer(struct task_struct *task)
-{
- return css_freezer(task_css(task, freezer_cgrp_id));
+ /*
+ * If the new state is frozen, some freezing ancestor cgroups may change
+ * their state too, depending on if all their descendants are frozen.
+ *
+ * Otherwise, all ancestor cgroups are forced into the non-frozen state.
+ */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (frozen) {
+ cgrp->freezer.nr_frozen_descendants += desc;
+ if (!test_bit(CGRP_FROZEN, &cgrp->flags) &&
+ test_bit(CGRP_FREEZE, &cgrp->flags) &&
+ cgrp->freezer.nr_frozen_descendants ==
+ cgrp->nr_descendants) {
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ cgroup_file_notify(&cgrp->events_file);
+ TRACE_CGROUP_PATH(notify_frozen, cgrp, 1);
+ desc++;
+ }
+ } else {
+ cgrp->freezer.nr_frozen_descendants -= desc;
+ if (test_bit(CGRP_FROZEN, &cgrp->flags)) {
+ clear_bit(CGRP_FROZEN, &cgrp->flags);
+ cgroup_file_notify(&cgrp->events_file);
+ TRACE_CGROUP_PATH(notify_frozen, cgrp, 0);
+ desc++;
+ }
+ }
+ }
}
-static struct freezer *parent_freezer(struct freezer *freezer)
+/*
+ * Revisit the cgroup frozen state.
+ * Checks if the cgroup is really frozen and perform all state transitions.
+ */
+void cgroup_update_frozen(struct cgroup *cgrp)
{
- return css_freezer(freezer->css.parent);
-}
+ bool frozen;
-bool cgroup_freezing(struct task_struct *task)
-{
- bool ret;
+ lockdep_assert_held(&css_set_lock);
- rcu_read_lock();
- ret = task_freezer(task)->state & CGROUP_FREEZING;
- rcu_read_unlock();
+ /*
+ * If the cgroup has to be frozen (CGRP_FREEZE bit set),
+ * and all tasks are frozen and/or stopped, let's consider
+ * the cgroup frozen. Otherwise it's not frozen.
+ */
+ frozen = test_bit(CGRP_FREEZE, &cgrp->flags) &&
+ cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp);
- return ret;
-}
+ if (frozen) {
+ /* Already there? */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags))
+ return;
-static const char *freezer_state_strs(unsigned int state)
-{
- if (state & CGROUP_FROZEN)
- return "FROZEN";
- if (state & CGROUP_FREEZING)
- return "FREEZING";
- return "THAWED";
-};
-
-static struct cgroup_subsys_state *
-freezer_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct freezer *freezer;
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ } else {
+ /* Already there? */
+ if (!test_bit(CGRP_FROZEN, &cgrp->flags))
+ return;
- freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
- if (!freezer)
- return ERR_PTR(-ENOMEM);
+ clear_bit(CGRP_FROZEN, &cgrp->flags);
+ }
+ cgroup_file_notify(&cgrp->events_file);
+ TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
- return &freezer->css;
+ /* Update the state of ancestor cgroups. */
+ cgroup_propagate_frozen(cgrp, frozen);
}
-/**
- * freezer_css_online - commit creation of a freezer css
- * @css: css being created
- *
- * We're committing to creation of @css. Mark it online and inherit
- * parent's freezing state while holding both parent's and our
- * freezer->lock.
+/*
+ * Increment cgroup's nr_frozen_tasks.
*/
-static int freezer_css_online(struct cgroup_subsys_state *css)
+static void cgroup_inc_frozen_cnt(struct cgroup *cgrp)
{
- struct freezer *freezer = css_freezer(css);
- struct freezer *parent = parent_freezer(freezer);
-
- mutex_lock(&freezer_mutex);
-
- freezer->state |= CGROUP_FREEZER_ONLINE;
-
- if (parent && (parent->state & CGROUP_FREEZING)) {
- freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
- }
-
- mutex_unlock(&freezer_mutex);
- return 0;
+ cgrp->freezer.nr_frozen_tasks++;
}
-/**
- * freezer_css_offline - initiate destruction of a freezer css
- * @css: css being destroyed
- *
- * @css is going away. Mark it dead and decrement system_freezing_count if
- * it was holding one.
+/*
+ * Decrement cgroup's nr_frozen_tasks.
*/
-static void freezer_css_offline(struct cgroup_subsys_state *css)
+static void cgroup_dec_frozen_cnt(struct cgroup *cgrp)
{
- struct freezer *freezer = css_freezer(css);
-
- mutex_lock(&freezer_mutex);
-
- if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
-
- freezer->state = 0;
-
- mutex_unlock(&freezer_mutex);
+ cgrp->freezer.nr_frozen_tasks--;
+ WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0);
}
-static void freezer_css_free(struct cgroup_subsys_state *css)
+/*
+ * Enter frozen/stopped state, if not yet there. Update cgroup's counters,
+ * and revisit the state of the cgroup, if necessary.
+ */
+void cgroup_enter_frozen(void)
{
- kfree(css_freezer(css));
+ struct cgroup *cgrp;
+
+ if (current->frozen)
+ return;
+
+ spin_lock_irq(&css_set_lock);
+ current->frozen = true;
+ cgrp = task_dfl_cgroup(current);
+ cgroup_inc_frozen_cnt(cgrp);
+ cgroup_update_frozen(cgrp);
+ spin_unlock_irq(&css_set_lock);
}
/*
- * Tasks can be migrated into a different freezer anytime regardless of its
- * current state. freezer_attach() is responsible for making new tasks
- * conform to the current state.
+ * Conditionally leave frozen/stopped state. Update cgroup's counters,
+ * and revisit the state of the cgroup, if necessary.
*
- * Freezer state changes and task migration are synchronized via
- * @freezer->lock. freezer_attach() makes the new tasks conform to the
- * current state and all following state changes can see the new tasks.
+ * If always_leave is not set, and the cgroup is freezing,
+ * we're racing with the cgroup freezing. In this case, we don't
+ * drop the frozen counter to avoid a transient switch to
+ * the unfrozen state.
*/
-static void freezer_attach(struct cgroup_taskset *tset)
+void cgroup_leave_frozen(bool always_leave)
{
- struct task_struct *task;
- struct cgroup_subsys_state *new_css;
-
- mutex_lock(&freezer_mutex);
-
- /*
- * Make the new tasks conform to the current state of @new_css.
- * For simplicity, when migrating any task to a FROZEN cgroup, we
- * revert it to FREEZING and let update_if_frozen() determine the
- * correct state later.
- *
- * Tasks in @tset are on @new_css but may not conform to its
- * current state before executing the following - !frozen tasks may
- * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
- */
- cgroup_taskset_for_each(task, new_css, tset) {
- struct freezer *freezer = css_freezer(new_css);
-
- if (!(freezer->state & CGROUP_FREEZING)) {
- __thaw_task(task);
- } else {
- freeze_task(task);
- /* clear FROZEN and propagate upwards */
- while (freezer && (freezer->state & CGROUP_FROZEN)) {
- freezer->state &= ~CGROUP_FROZEN;
- freezer = parent_freezer(freezer);
- }
- }
+ struct cgroup *cgrp;
+
+ spin_lock_irq(&css_set_lock);
+ cgrp = task_dfl_cgroup(current);
+ if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) {
+ cgroup_dec_frozen_cnt(cgrp);
+ cgroup_update_frozen(cgrp);
+ WARN_ON_ONCE(!current->frozen);
+ current->frozen = false;
+ } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) {
+ spin_lock(&current->sighand->siglock);
+ current->jobctl |= JOBCTL_TRAP_FREEZE;
+ set_thread_flag(TIF_SIGPENDING);
+ spin_unlock(&current->sighand->siglock);
}
-
- mutex_unlock(&freezer_mutex);
+ spin_unlock_irq(&css_set_lock);
}
-/**
- * freezer_fork - cgroup post fork callback
- * @task: a task which has just been forked
- *
- * @task has just been created and should conform to the current state of
- * the cgroup_freezer it belongs to. This function may race against
- * freezer_attach(). Losing to freezer_attach() means that we don't have
- * to do anything as freezer_attach() will put @task into the appropriate
- * state.
+/*
+ * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE
+ * jobctl bit.
*/
-static void freezer_fork(struct task_struct *task)
+static void cgroup_freeze_task(struct task_struct *task, bool freeze)
{
- struct freezer *freezer;
+ unsigned long flags;
- /*
- * The root cgroup is non-freezable, so we can skip locking the
- * freezer. This is safe regardless of race with task migration.
- * If we didn't race or won, skipping is obviously the right thing
- * to do. If we lost and root is the new cgroup, noop is still the
- * right thing to do.
- */
- if (task_css_is_root(task, freezer_cgrp_id))
+ /* If the task is about to die, don't bother with freezing it. */
+ if (!lock_task_sighand(task, &flags))
return;
- mutex_lock(&freezer_mutex);
- rcu_read_lock();
-
- freezer = task_freezer(task);
- if (freezer->state & CGROUP_FREEZING)
- freeze_task(task);
+ if (freeze) {
+ task->jobctl |= JOBCTL_TRAP_FREEZE;
+ signal_wake_up(task, false);
+ } else {
+ task->jobctl &= ~JOBCTL_TRAP_FREEZE;
+ wake_up_process(task);
+ }
- rcu_read_unlock();
- mutex_unlock(&freezer_mutex);
+ unlock_task_sighand(task, &flags);
}
-/**
- * update_if_frozen - update whether a cgroup finished freezing
- * @css: css of interest
- *
- * Once FREEZING is initiated, transition to FROZEN is lazily updated by
- * calling this function. If the current state is FREEZING but not FROZEN,
- * this function checks whether all tasks of this cgroup and the descendant
- * cgroups finished freezing and, if so, sets FROZEN.
- *
- * The caller is responsible for grabbing RCU read lock and calling
- * update_if_frozen() on all descendants prior to invoking this function.
- *
- * Task states and freezer state might disagree while tasks are being
- * migrated into or out of @css, so we can't verify task states against
- * @freezer state here. See freezer_attach() for details.
+/*
+ * Freeze or unfreeze all tasks in the given cgroup.
*/
-static void update_if_frozen(struct cgroup_subsys_state *css)
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
{
- struct freezer *freezer = css_freezer(css);
- struct cgroup_subsys_state *pos;
struct css_task_iter it;
struct task_struct *task;
- lockdep_assert_held(&freezer_mutex);
-
- if (!(freezer->state & CGROUP_FREEZING) ||
- (freezer->state & CGROUP_FROZEN))
- return;
+ lockdep_assert_held(&cgroup_mutex);
- /* are all (live) children frozen? */
- rcu_read_lock();
- css_for_each_child(pos, css) {
- struct freezer *child = css_freezer(pos);
-
- if ((child->state & CGROUP_FREEZER_ONLINE) &&
- !(child->state & CGROUP_FROZEN)) {
- rcu_read_unlock();
- return;
- }
- }
- rcu_read_unlock();
+ spin_lock_irq(&css_set_lock);
+ if (freeze)
+ set_bit(CGRP_FREEZE, &cgrp->flags);
+ else
+ clear_bit(CGRP_FREEZE, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
- /* are all tasks frozen? */
- css_task_iter_start(css, 0, &it);
+ if (freeze)
+ TRACE_CGROUP_PATH(freeze, cgrp);
+ else
+ TRACE_CGROUP_PATH(unfreeze, cgrp);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
- }
-
- freezer->state |= CGROUP_FROZEN;
-out_iter_end:
- css_task_iter_end(&it);
-}
-
-static int freezer_read(struct seq_file *m, void *v)
-{
- struct cgroup_subsys_state *css = seq_css(m), *pos;
-
- mutex_lock(&freezer_mutex);
- rcu_read_lock();
-
- /* update states bottom-up */
- css_for_each_descendant_post(pos, css) {
- if (!css_tryget_online(pos))
+ /*
+ * Ignore kernel threads here. Freezing cgroups containing
+ * kthreads isn't supported.
+ */
+ if (task->flags & PF_KTHREAD)
continue;
- rcu_read_unlock();
-
- update_if_frozen(pos);
-
- rcu_read_lock();
- css_put(pos);
+ cgroup_freeze_task(task, freeze);
}
-
- rcu_read_unlock();
- mutex_unlock(&freezer_mutex);
-
- seq_puts(m, freezer_state_strs(css_freezer(css)->state));
- seq_putc(m, '\n');
- return 0;
-}
-
-static void freeze_cgroup(struct freezer *freezer)
-{
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&freezer->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- freeze_task(task);
css_task_iter_end(&it);
-}
-static void unfreeze_cgroup(struct freezer *freezer)
-{
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&freezer->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- __thaw_task(task);
- css_task_iter_end(&it);
+ /*
+ * Cgroup state should be revisited here to cover empty leaf cgroups
+ * and cgroups which descendants are already in the desired state.
+ */
+ spin_lock_irq(&css_set_lock);
+ if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants)
+ cgroup_update_frozen(cgrp);
+ spin_unlock_irq(&css_set_lock);
}
-/**
- * freezer_apply_state - apply state change to a single cgroup_freezer
- * @freezer: freezer to apply state change to
- * @freeze: whether to freeze or unfreeze
- * @state: CGROUP_FREEZING_* flag to set or clear
- *
- * Set or clear @state on @cgroup according to @freeze, and perform
- * freezing or thawing as necessary.
+/*
+ * Adjust the task state (freeze or unfreeze) and revisit the state of
+ * source and destination cgroups.
*/
-static void freezer_apply_state(struct freezer *freezer, bool freeze,
- unsigned int state)
+void cgroup_freezer_migrate_task(struct task_struct *task,
+ struct cgroup *src, struct cgroup *dst)
{
- /* also synchronizes against task migration, see freezer_attach() */
- lockdep_assert_held(&freezer_mutex);
+ lockdep_assert_held(&css_set_lock);
- if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ /*
+ * Kernel threads are not supposed to be frozen at all.
+ */
+ if (task->flags & PF_KTHREAD)
return;
- if (freeze) {
- if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
- freezer->state |= state;
- freeze_cgroup(freezer);
- } else {
- bool was_freezing = freezer->state & CGROUP_FREEZING;
-
- freezer->state &= ~state;
-
- if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
- freezer->state &= ~CGROUP_FROZEN;
- unfreeze_cgroup(freezer);
- }
+ /*
+ * Adjust counters of freezing and frozen tasks.
+ * Note, that if the task is frozen, but the destination cgroup is not
+ * frozen, we bump both counters to keep them balanced.
+ */
+ if (task->frozen) {
+ cgroup_inc_frozen_cnt(dst);
+ cgroup_dec_frozen_cnt(src);
}
-}
-
-/**
- * freezer_change_state - change the freezing state of a cgroup_freezer
- * @freezer: freezer of interest
- * @freeze: whether to freeze or thaw
- *
- * Freeze or thaw @freezer according to @freeze. The operations are
- * recursive - all descendants of @freezer will be affected.
- */
-static void freezer_change_state(struct freezer *freezer, bool freeze)
-{
- struct cgroup_subsys_state *pos;
+ cgroup_update_frozen(dst);
+ cgroup_update_frozen(src);
/*
- * Update all its descendants in pre-order traversal. Each
- * descendant will try to inherit its parent's FREEZING state as
- * CGROUP_FREEZING_PARENT.
+ * Force the task to the desired state.
*/
- mutex_lock(&freezer_mutex);
- rcu_read_lock();
- css_for_each_descendant_pre(pos, &freezer->css) {
- struct freezer *pos_f = css_freezer(pos);
- struct freezer *parent = parent_freezer(pos_f);
-
- if (!css_tryget_online(pos))
- continue;
- rcu_read_unlock();
-
- if (pos_f == freezer)
- freezer_apply_state(pos_f, freeze,
- CGROUP_FREEZING_SELF);
- else
- freezer_apply_state(pos_f,
- parent->state & CGROUP_FREEZING,
- CGROUP_FREEZING_PARENT);
-
- rcu_read_lock();
- css_put(pos);
- }
- rcu_read_unlock();
- mutex_unlock(&freezer_mutex);
+ cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags));
}
-static ssize_t freezer_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+void cgroup_freeze(struct cgroup *cgrp, bool freeze)
{
- bool freeze;
+ struct cgroup_subsys_state *css;
+ struct cgroup *dsct;
+ bool applied = false;
- buf = strstrip(buf);
+ lockdep_assert_held(&cgroup_mutex);
- if (strcmp(buf, freezer_state_strs(0)) == 0)
- freeze = false;
- else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
- freeze = true;
- else
- return -EINVAL;
+ /*
+ * Nothing changed? Just exit.
+ */
+ if (cgrp->freezer.freeze == freeze)
+ return;
- freezer_change_state(css_freezer(of_css(of)), freeze);
- return nbytes;
-}
+ cgrp->freezer.freeze = freeze;
-static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- struct freezer *freezer = css_freezer(css);
+ /*
+ * Propagate changes downwards the cgroup tree.
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ dsct = css->cgroup;
- return (bool)(freezer->state & CGROUP_FREEZING_SELF);
-}
+ if (cgroup_is_dead(dsct))
+ continue;
-static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- struct freezer *freezer = css_freezer(css);
+ if (freeze) {
+ dsct->freezer.e_freeze++;
+ /*
+ * Already frozen because of ancestor's settings?
+ */
+ if (dsct->freezer.e_freeze > 1)
+ continue;
+ } else {
+ dsct->freezer.e_freeze--;
+ /*
+ * Still frozen because of ancestor's settings?
+ */
+ if (dsct->freezer.e_freeze > 0)
+ continue;
- return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
-}
+ WARN_ON_ONCE(dsct->freezer.e_freeze < 0);
+ }
+
+ /*
+ * Do change actual state: freeze or unfreeze.
+ */
+ cgroup_do_freeze(dsct, freeze);
+ applied = true;
+ }
-static struct cftype files[] = {
- {
- .name = "state",
- .flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = freezer_read,
- .write = freezer_write,
- },
- {
- .name = "self_freezing",
- .flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = freezer_self_freezing_read,
- },
- {
- .name = "parent_freezing",
- .flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = freezer_parent_freezing_read,
- },
- { } /* terminate */
-};
-
-struct cgroup_subsys freezer_cgrp_subsys = {
- .css_alloc = freezer_css_alloc,
- .css_online = freezer_css_online,
- .css_offline = freezer_css_offline,
- .css_free = freezer_css_free,
- .attach = freezer_attach,
- .fork = freezer_fork,
- .legacy_cftypes = files,
-};
+ /*
+ * Even if the actual state hasn't changed, let's notify a user.
+ * The state can be enforced by an ancestor cgroup: the cgroup
+ * can already be in the desired state or it can be locked in the
+ * opposite state, so that the transition will never happen.
+ * In both cases it's better to notify a user, that there is
+ * nothing to wait for.
+ */
+ if (!applied) {
+ TRACE_CGROUP_PATH(notify_frozen, cgrp,
+ test_bit(CGRP_FROZEN, &cgrp->flags));
+ cgroup_file_notify(&cgrp->events_file);
+ }
+}
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
new file mode 100644
index 000000000000..08236798d173
--- /dev/null
+++ b/kernel/cgroup/legacy_freezer.c
@@ -0,0 +1,481 @@
+/*
+ * cgroup_freezer.c - control group freezer subsystem
+ *
+ * Copyright IBM Corporation, 2007
+ *
+ * Author : Cedric Le Goater <clg@fr.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/freezer.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+/*
+ * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
+ * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
+ * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
+ * its ancestors has FREEZING_SELF set.
+ */
+enum freezer_state_flags {
+ CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
+ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
+ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
+ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+
+ /* mask for all FREEZING flags */
+ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
+};
+
+struct freezer {
+ struct cgroup_subsys_state css;
+ unsigned int state;
+};
+
+static DEFINE_MUTEX(freezer_mutex);
+
+static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct freezer, css) : NULL;
+}
+
+static inline struct freezer *task_freezer(struct task_struct *task)
+{
+ return css_freezer(task_css(task, freezer_cgrp_id));
+}
+
+static struct freezer *parent_freezer(struct freezer *freezer)
+{
+ return css_freezer(freezer->css.parent);
+}
+
+bool cgroup_freezing(struct task_struct *task)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static const char *freezer_state_strs(unsigned int state)
+{
+ if (state & CGROUP_FROZEN)
+ return "FROZEN";
+ if (state & CGROUP_FREEZING)
+ return "FREEZING";
+ return "THAWED";
+};
+
+static struct cgroup_subsys_state *
+freezer_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct freezer *freezer;
+
+ freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
+ if (!freezer)
+ return ERR_PTR(-ENOMEM);
+
+ return &freezer->css;
+}
+
+/**
+ * freezer_css_online - commit creation of a freezer css
+ * @css: css being created
+ *
+ * We're committing to creation of @css. Mark it online and inherit
+ * parent's freezing state while holding both parent's and our
+ * freezer->lock.
+ */
+static int freezer_css_online(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+ struct freezer *parent = parent_freezer(freezer);
+
+ mutex_lock(&freezer_mutex);
+
+ freezer->state |= CGROUP_FREEZER_ONLINE;
+
+ if (parent && (parent->state & CGROUP_FREEZING)) {
+ freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+ atomic_inc(&system_freezing_cnt);
+ }
+
+ mutex_unlock(&freezer_mutex);
+ return 0;
+}
+
+/**
+ * freezer_css_offline - initiate destruction of a freezer css
+ * @css: css being destroyed
+ *
+ * @css is going away. Mark it dead and decrement system_freezing_count if
+ * it was holding one.
+ */
+static void freezer_css_offline(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ mutex_lock(&freezer_mutex);
+
+ if (freezer->state & CGROUP_FREEZING)
+ atomic_dec(&system_freezing_cnt);
+
+ freezer->state = 0;
+
+ mutex_unlock(&freezer_mutex);
+}
+
+static void freezer_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_freezer(css));
+}
+
+/*
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * @freezer->lock. freezer_attach() makes the new tasks conform to the
+ * current state and all following state changes can see the new tasks.
+ */
+static void freezer_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *new_css;
+
+ mutex_lock(&freezer_mutex);
+
+ /*
+ * Make the new tasks conform to the current state of @new_css.
+ * For simplicity, when migrating any task to a FROZEN cgroup, we
+ * revert it to FREEZING and let update_if_frozen() determine the
+ * correct state later.
+ *
+ * Tasks in @tset are on @new_css but may not conform to its
+ * current state before executing the following - !frozen tasks may
+ * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
+ */
+ cgroup_taskset_for_each(task, new_css, tset) {
+ struct freezer *freezer = css_freezer(new_css);
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ __thaw_task(task);
+ } else {
+ freeze_task(task);
+ /* clear FROZEN and propagate upwards */
+ while (freezer && (freezer->state & CGROUP_FROZEN)) {
+ freezer->state &= ~CGROUP_FROZEN;
+ freezer = parent_freezer(freezer);
+ }
+ }
+ }
+
+ mutex_unlock(&freezer_mutex);
+}
+
+/**
+ * freezer_fork - cgroup post fork callback
+ * @task: a task which has just been forked
+ *
+ * @task has just been created and should conform to the current state of
+ * the cgroup_freezer it belongs to. This function may race against
+ * freezer_attach(). Losing to freezer_attach() means that we don't have
+ * to do anything as freezer_attach() will put @task into the appropriate
+ * state.
+ */
+static void freezer_fork(struct task_struct *task)
+{
+ struct freezer *freezer;
+
+ /*
+ * The root cgroup is non-freezable, so we can skip locking the
+ * freezer. This is safe regardless of race with task migration.
+ * If we didn't race or won, skipping is obviously the right thing
+ * to do. If we lost and root is the new cgroup, noop is still the
+ * right thing to do.
+ */
+ if (task_css_is_root(task, freezer_cgrp_id))
+ return;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
+ freeze_task(task);
+
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+}
+
+/**
+ * update_if_frozen - update whether a cgroup finished freezing
+ * @css: css of interest
+ *
+ * Once FREEZING is initiated, transition to FROZEN is lazily updated by
+ * calling this function. If the current state is FREEZING but not FROZEN,
+ * this function checks whether all tasks of this cgroup and the descendant
+ * cgroups finished freezing and, if so, sets FROZEN.
+ *
+ * The caller is responsible for grabbing RCU read lock and calling
+ * update_if_frozen() on all descendants prior to invoking this function.
+ *
+ * Task states and freezer state might disagree while tasks are being
+ * migrated into or out of @css, so we can't verify task states against
+ * @freezer state here. See freezer_attach() for details.
+ */
+static void update_if_frozen(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+ struct cgroup_subsys_state *pos;
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZING) ||
+ (freezer->state & CGROUP_FROZEN))
+ return;
+
+ /* are all (live) children frozen? */
+ rcu_read_lock();
+ css_for_each_child(pos, css) {
+ struct freezer *child = css_freezer(pos);
+
+ if ((child->state & CGROUP_FREEZER_ONLINE) &&
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ /* are all tasks frozen? */
+ css_task_iter_start(css, 0, &it);
+
+ while ((task = css_task_iter_next(&it))) {
+ if (freezing(task)) {
+ /*
+ * freezer_should_skip() indicates that the task
+ * should be skipped when determining freezing
+ * completion. Consider it frozen in addition to
+ * the usual frozen condition.
+ */
+ if (!frozen(task) && !freezer_should_skip(task))
+ goto out_iter_end;
+ }
+ }
+
+ freezer->state |= CGROUP_FROZEN;
+out_iter_end:
+ css_task_iter_end(&it);
+}
+
+static int freezer_read(struct seq_file *m, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(m), *pos;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ /* update states bottom-up */
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ update_if_frozen(pos);
+
+ rcu_read_lock();
+ css_put(pos);
+ }
+
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+
+ seq_puts(m, freezer_state_strs(css_freezer(css)->state));
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void freeze_cgroup(struct freezer *freezer)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&freezer->css, 0, &it);
+ while ((task = css_task_iter_next(&it)))
+ freeze_task(task);
+ css_task_iter_end(&it);
+}
+
+static void unfreeze_cgroup(struct freezer *freezer)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&freezer->css, 0, &it);
+ while ((task = css_task_iter_next(&it)))
+ __thaw_task(task);
+ css_task_iter_end(&it);
+}
+
+/**
+ * freezer_apply_state - apply state change to a single cgroup_freezer
+ * @freezer: freezer to apply state change to
+ * @freeze: whether to freeze or unfreeze
+ * @state: CGROUP_FREEZING_* flag to set or clear
+ *
+ * Set or clear @state on @cgroup according to @freeze, and perform
+ * freezing or thawing as necessary.
+ */
+static void freezer_apply_state(struct freezer *freezer, bool freeze,
+ unsigned int state)
+{
+ /* also synchronizes against task migration, see freezer_attach() */
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ return;
+
+ if (freeze) {
+ if (!(freezer->state & CGROUP_FREEZING))
+ atomic_inc(&system_freezing_cnt);
+ freezer->state |= state;
+ freeze_cgroup(freezer);
+ } else {
+ bool was_freezing = freezer->state & CGROUP_FREEZING;
+
+ freezer->state &= ~state;
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ if (was_freezing)
+ atomic_dec(&system_freezing_cnt);
+ freezer->state &= ~CGROUP_FROZEN;
+ unfreeze_cgroup(freezer);
+ }
+ }
+}
+
+/**
+ * freezer_change_state - change the freezing state of a cgroup_freezer
+ * @freezer: freezer of interest
+ * @freeze: whether to freeze or thaw
+ *
+ * Freeze or thaw @freezer according to @freeze. The operations are
+ * recursive - all descendants of @freezer will be affected.
+ */
+static void freezer_change_state(struct freezer *freezer, bool freeze)
+{
+ struct cgroup_subsys_state *pos;
+
+ /*
+ * Update all its descendants in pre-order traversal. Each
+ * descendant will try to inherit its parent's FREEZING state as
+ * CGROUP_FREEZING_PARENT.
+ */
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, &freezer->css) {
+ struct freezer *pos_f = css_freezer(pos);
+ struct freezer *parent = parent_freezer(pos_f);
+
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ if (pos_f == freezer)
+ freezer_apply_state(pos_f, freeze,
+ CGROUP_FREEZING_SELF);
+ else
+ freezer_apply_state(pos_f,
+ parent->state & CGROUP_FREEZING,
+ CGROUP_FREEZING_PARENT);
+
+ rcu_read_lock();
+ css_put(pos);
+ }
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+}
+
+static ssize_t freezer_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ bool freeze;
+
+ buf = strstrip(buf);
+
+ if (strcmp(buf, freezer_state_strs(0)) == 0)
+ freeze = false;
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ freeze = true;
+ else
+ return -EINVAL;
+
+ freezer_change_state(css_freezer(of_css(of)), freeze);
+ return nbytes;
+}
+
+static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_SELF);
+}
+
+static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "state",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = freezer_read,
+ .write = freezer_write,
+ },
+ {
+ .name = "self_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_self_freezing_read,
+ },
+ {
+ .name = "parent_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_parent_freezing_read,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys freezer_cgrp_subsys = {
+ .css_alloc = freezer_css_alloc,
+ .css_online = freezer_css_online,
+ .css_offline = freezer_css_offline,
+ .css_free = freezer_css_free,
+ .attach = freezer_attach,
+ .fork = freezer_fork,
+ .legacy_cftypes = files,
+};
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 9829c67ebc0a..8e513a573fe9 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Process number limiting controller for cgroups.
*
@@ -25,10 +26,6 @@
* a superset of parent/child/pids.current.
*
* Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/kernel.h>
@@ -247,7 +244,7 @@ static void pids_cancel_fork(struct task_struct *task)
pids_uncharge(pids, 1);
}
-static void pids_free(struct task_struct *task)
+static void pids_release(struct task_struct *task)
{
struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
@@ -342,7 +339,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.cancel_attach = pids_cancel_attach,
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
- .free = pids_free,
+ .release = pids_release,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
.threaded = true,
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index d3bbb757ee49..ae042c347c64 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* RDMA resource limiting controller for cgroups.
*
@@ -5,10 +6,6 @@
* additional RDMA resources after a certain limit is reached.
*
* Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/bitops.h>
@@ -313,10 +310,8 @@ EXPORT_SYMBOL(rdmacg_try_charge);
* If IB stack wish a device to participate in rdma cgroup resource
* tracking, it must invoke this API to register with rdma cgroup before
* any user space application can start using the RDMA resources.
- * Returns 0 on success or EINVAL when table length given is beyond
- * supported size.
*/
-int rdmacg_register_device(struct rdmacg_device *device)
+void rdmacg_register_device(struct rdmacg_device *device)
{
INIT_LIST_HEAD(&device->dev_node);
INIT_LIST_HEAD(&device->rpools);
@@ -324,7 +319,6 @@ int rdmacg_register_device(struct rdmacg_device *device)
mutex_lock(&rdmacg_mutex);
list_add_tail(&device->dev_node, &rdmacg_devices);
mutex_unlock(&rdmacg_mutex);
- return 0;
}
EXPORT_SYMBOL(rdmacg_register_device);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index d503d1a9007c..ca19b4c8acf5 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
#include <linux/sched/cputime.h>
@@ -87,7 +88,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
- struct cgroup *parent;
if (pos == root)
return NULL;
@@ -115,8 +115,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- parent = cgroup_parent(pos);
- if (parent && rstatc->updated_next) {
+ if (rstatc->updated_next) {
+ struct cgroup *parent = cgroup_parent(pos);
struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
struct cgroup_rstat_cpu *nrstatc;
struct cgroup **nextp;
@@ -140,9 +140,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* updated stat.
*/
smp_mb();
+
+ return pos;
}
- return pos;
+ /* only happens for @root */
+ return NULL;
}
/* see cgroup_rstat_flush() */
diff --git a/kernel/compat.c b/kernel/compat.c
index f01affa17e22..a2bc1d6ceb57 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/compat.c
*
@@ -5,10 +6,6 @@
* on 64 bit kernels.
*
* Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
@@ -20,7 +17,6 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/security.h>
-#include <linux/timex.h>
#include <linux/export.h>
#include <linux/migrate.h>
#include <linux/posix-timers.h>
@@ -30,69 +26,6 @@
#include <linux/uaccess.h>
-int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp)
-{
- struct compat_timex tx32;
-
- memset(txc, 0, sizeof(struct timex));
- if (copy_from_user(&tx32, utp, sizeof(struct compat_timex)))
- return -EFAULT;
-
- txc->modes = tx32.modes;
- txc->offset = tx32.offset;
- txc->freq = tx32.freq;
- txc->maxerror = tx32.maxerror;
- txc->esterror = tx32.esterror;
- txc->status = tx32.status;
- txc->constant = tx32.constant;
- txc->precision = tx32.precision;
- txc->tolerance = tx32.tolerance;
- txc->time.tv_sec = tx32.time.tv_sec;
- txc->time.tv_usec = tx32.time.tv_usec;
- txc->tick = tx32.tick;
- txc->ppsfreq = tx32.ppsfreq;
- txc->jitter = tx32.jitter;
- txc->shift = tx32.shift;
- txc->stabil = tx32.stabil;
- txc->jitcnt = tx32.jitcnt;
- txc->calcnt = tx32.calcnt;
- txc->errcnt = tx32.errcnt;
- txc->stbcnt = tx32.stbcnt;
-
- return 0;
-}
-
-int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc)
-{
- struct compat_timex tx32;
-
- memset(&tx32, 0, sizeof(struct compat_timex));
- tx32.modes = txc->modes;
- tx32.offset = txc->offset;
- tx32.freq = txc->freq;
- tx32.maxerror = txc->maxerror;
- tx32.esterror = txc->esterror;
- tx32.status = txc->status;
- tx32.constant = txc->constant;
- tx32.precision = txc->precision;
- tx32.tolerance = txc->tolerance;
- tx32.time.tv_sec = txc->time.tv_sec;
- tx32.time.tv_usec = txc->time.tv_usec;
- tx32.tick = txc->tick;
- tx32.ppsfreq = txc->ppsfreq;
- tx32.jitter = txc->jitter;
- tx32.shift = txc->shift;
- tx32.stabil = txc->stabil;
- tx32.jitcnt = txc->jitcnt;
- tx32.calcnt = txc->calcnt;
- tx32.errcnt = txc->errcnt;
- tx32.stbcnt = txc->stbcnt;
- tx32.tai = txc->tai;
- if (copy_to_user(utp, &tx32, sizeof(struct compat_timex)))
- return -EFAULT;
- return 0;
-}
-
static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv)
{
return (!access_ok(ctv, sizeof(*ctv)) ||
@@ -410,8 +343,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
return -EFAULT;
switch (_NSIG_WORDS) {
case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+ /* fall through */
case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+ /* fall through */
case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+ /* fall through */
case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
}
#else
diff --git a/kernel/configs.c b/kernel/configs.c
index 2df132b20217..b062425ccf8d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -30,37 +30,35 @@
#include <linux/init.h>
#include <linux/uaccess.h>
-/**************************************************/
-/* the actual current config file */
-
/*
- * Define kernel_config_data and kernel_config_data_size, which contains the
- * wrapped and compressed configuration file. The file is first compressed
- * with gzip and then bounded by two eight byte magic numbers to allow
- * extraction from a binary kernel image:
- *
- * IKCFG_ST
- * <image>
- * IKCFG_ED
+ * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from
+ * a binary kernel image or a module. See scripts/extract-ikconfig.
*/
-#define MAGIC_START "IKCFG_ST"
-#define MAGIC_END "IKCFG_ED"
-#include "config_data.h"
-
-
-#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
-#define kernel_config_data_size \
- (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .ascii \"IKCFG_ST\" \n"
+" .global kernel_config_data \n"
+"kernel_config_data: \n"
+" .incbin \"kernel/config_data.gz\" \n"
+" .global kernel_config_data_end \n"
+"kernel_config_data_end: \n"
+" .ascii \"IKCFG_ED\" \n"
+" .popsection \n"
+);
#ifdef CONFIG_IKCONFIG_PROC
+extern char kernel_config_data;
+extern char kernel_config_data_end;
+
static ssize_t
ikconfig_read_current(struct file *file, char __user *buf,
size_t len, loff_t * offset)
{
return simple_read_from_buffer(buf, len, offset,
- kernel_config_data + MAGIC_SIZE,
- kernel_config_data_size);
+ &kernel_config_data,
+ &kernel_config_data_end -
+ &kernel_config_data);
}
static const struct file_operations ikconfig_file_ops = {
@@ -79,7 +77,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- proc_set_size(entry, kernel_config_data_size);
+ proc_set_size(entry, &kernel_config_data_end - &kernel_config_data);
return 0;
}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 9ad37b9e44a7..be01a4d627c9 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Context tracking: Probe on high level context boundaries such as kernel
* and userspace. This includes syscalls and exceptions entry/exit.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d1c6d152da89..e84c0873559e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -9,6 +9,7 @@
#include <linux/notifier.h>
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
@@ -313,6 +314,15 @@ void cpus_write_unlock(void)
void lockdep_assert_cpus_held(void)
{
+ /*
+ * We can't have hotplug operations before userspace starts running,
+ * and some init codepaths will knowingly not take the hotplug lock.
+ * This is all valid, so mute lockdep until it makes sense to report
+ * unheld locks.
+ */
+ if (system_state < SYSTEM_RUNNING)
+ return;
+
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
@@ -512,7 +522,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
/*
* SMT soft disabling on X86 requires to bring the CPU out of the
* BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
- * CPU marked itself as booted_once in cpu_notify_starting() so the
+ * CPU marked itself as booted_once in notify_cpu_starting() so the
* cpu_smt_allowed() check will now return false if this is not the
* primary sibling.
*/
@@ -555,6 +565,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
}
+static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
+{
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ /*
+ * When CPU hotplug is disabled, then taking the CPU down is not
+ * possible because takedown_cpu() and the architecture and
+ * subsystem specific mechanisms are not available. So the CPU
+ * which would be completely unplugged again needs to stay around
+ * in the current state.
+ */
+ return st->state <= CPUHP_BRINGUP_CPU;
+}
+
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
@@ -565,8 +589,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
- st->target = prev_state;
- undo_cpu_up(cpu, st);
+ if (can_rollback_cpu(st)) {
+ st->target = prev_state;
+ undo_cpu_up(cpu, st);
+ }
break;
}
}
@@ -835,6 +861,8 @@ static int take_cpu_down(void *_param)
/* Give up timekeeping duties */
tick_handover_do_timer();
+ /* Remove CPU from timer broadcasting */
+ tick_offline_cpu(cpu);
/* Park the stopper thread */
stop_machine_park(cpu);
return 0;
@@ -1174,8 +1202,15 @@ int freeze_secondary_cpus(int primary)
int cpu, error = 0;
cpu_maps_update_begin();
- if (!cpu_online(primary))
+ if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
+ if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
+ primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+ } else {
+ if (!cpu_online(primary))
+ primary = cpumask_first(cpu_online_mask);
+ }
+
/*
* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
@@ -1186,6 +1221,13 @@ int freeze_secondary_cpus(int primary)
for_each_online_cpu(cpu) {
if (cpu == primary)
continue;
+
+ if (pm_wakeup_pending()) {
+ pr_info("Wakeup pending. Abort CPU freeze\n");
+ error = -EBUSY;
+ break;
+ }
+
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
@@ -1929,6 +1971,9 @@ static ssize_t write_cpuhp_fail(struct device *dev,
if (ret)
return ret;
+ if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
+ return -EINVAL;
+
/*
* Cannot fail STARTING/DYING callbacks.
*/
@@ -2008,19 +2053,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {
#ifdef CONFIG_HOTPLUG_SMT
-static const char *smt_states[] = {
- [CPU_SMT_ENABLED] = "on",
- [CPU_SMT_DISABLED] = "off",
- [CPU_SMT_FORCE_DISABLED] = "forceoff",
- [CPU_SMT_NOT_SUPPORTED] = "notsupported",
-};
-
-static ssize_t
-show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
-}
-
static void cpuhp_offline_cpu_device(unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);
@@ -2039,7 +2071,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}
-static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
int cpu, ret = 0;
@@ -2071,7 +2103,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
return ret;
}
-static int cpuhp_smt_enable(void)
+int cpuhp_smt_enable(void)
{
int cpu, ret = 0;
@@ -2091,9 +2123,10 @@ static int cpuhp_smt_enable(void)
return ret;
}
+
static ssize_t
-store_smt_control(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
int ctrlval, ret;
@@ -2131,14 +2164,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr,
unlock_device_hotplug();
return ret ? ret : count;
}
+
+#else /* !CONFIG_HOTPLUG_SMT */
+static ssize_t
+__store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_HOTPLUG_SMT */
+
+static const char *smt_states[] = {
+ [CPU_SMT_ENABLED] = "on",
+ [CPU_SMT_DISABLED] = "off",
+ [CPU_SMT_FORCE_DISABLED] = "forceoff",
+ [CPU_SMT_NOT_SUPPORTED] = "notsupported",
+ [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
+};
+
+static ssize_t
+show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ const char *state = smt_states[cpu_smt_control];
+
+ return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
+}
+
+static ssize_t
+store_smt_control(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ return __store_smt_control(dev, attr, buf, count);
+}
static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
static ssize_t
show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
{
- bool active = topology_max_smt_threads() > 1;
-
- return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
+ return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
}
static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
@@ -2154,21 +2217,17 @@ static const struct attribute_group cpuhp_smt_attr_group = {
NULL
};
-static int __init cpu_smt_state_init(void)
+static int __init cpu_smt_sysfs_init(void)
{
return sysfs_create_group(&cpu_subsys.dev_root->kobj,
&cpuhp_smt_attr_group);
}
-#else
-static inline int cpu_smt_state_init(void) { return 0; }
-#endif
-
static int __init cpuhp_sysfs_init(void)
{
int cpu, ret;
- ret = cpu_smt_state_init();
+ ret = cpu_smt_sysfs_init();
if (ret)
return ret;
@@ -2189,7 +2248,7 @@ static int __init cpuhp_sysfs_init(void)
return 0;
}
device_initcall(cpuhp_sysfs_init);
-#endif
+#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */
/*
* cpu_bit_bitmap[] is a special, "compressed" data structure that
@@ -2279,3 +2338,21 @@ void __init boot_cpu_hotplug_init(void)
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
+
+enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+
+static int __init mitigations_parse_cmdline(char *arg)
+{
+ if (!strcmp(arg, "off"))
+ cpu_mitigations = CPU_MITIGATIONS_OFF;
+ else if (!strcmp(arg, "auto"))
+ cpu_mitigations = CPU_MITIGATIONS_AUTO;
+ else if (!strcmp(arg, "auto,nosmt"))
+ cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
+ else
+ pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
+ arg);
+
+ return 0;
+}
+early_param("mitigations", mitigations_parse_cmdline);
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 67b02e138a47..cbca6879ab7d 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -1,18 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2011 Google, Inc.
*
* Author:
* Colin Cross <ccross@android.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/kernel.h>
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 933cb3e45b98..9f1557b98468 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* crash.c - kernel crash support code.
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#include <linux/crash_core.h>
@@ -464,6 +462,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
+#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
+ VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
arch_crash_save_vmcoreinfo();
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index b64e238b553b..9c23ae074b40 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/crash_dump.h>
#include <linux/init.h>
diff --git a/kernel/cred.c b/kernel/cred.c
index 21f4a97085b4..f9a0ce66c9c3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Task credentials management - see Documentation/security/credentials.rst
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
*/
#include <linux/export.h>
#include <linux/cred.h>
@@ -174,6 +170,11 @@ void exit_creds(struct task_struct *tsk)
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
+
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+ key_put(current->cached_requested_key);
+ current->cached_requested_key = NULL;
+#endif
}
/**
@@ -327,6 +328,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
struct cred *new;
int ret;
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+ p->cached_requested_key = NULL;
+#endif
+
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
@@ -450,14 +455,23 @@ int commit_creds(struct cred *new)
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
+ /*
+ * If a task drops privileges and becomes nondumpable,
+ * the dumpability change must become visible before
+ * the credential change; otherwise, a __ptrace_may_access()
+ * racing with this change may be able to attach to a task it
+ * shouldn't be able to attach to (as if the task had dropped
+ * privileges without becoming nondumpable).
+ * Pairs with a read barrier in __ptrace_may_access().
+ */
smp_wmb();
}
/* alter the thread keyring */
if (!uid_eq(new->fsuid, old->fsuid))
- key_fsuid_changed(task);
+ key_fsuid_changed(new);
if (!gid_eq(new->fsgid, old->fsgid))
- key_fsgid_changed(task);
+ key_fsgid_changed(new);
/* do it
* RLIMIT_NPROC limits on user->processes have already been checked
@@ -760,19 +774,6 @@ bool creds_are_invalid(const struct cred *cred)
{
if (cred->magic != CRED_MAGIC)
return true;
-#ifdef CONFIG_SECURITY_SELINUX
- /*
- * cred->security == NULL if security_cred_alloc_blank() or
- * security_prepare_creds() returned an error.
- */
- if (selinux_is_enabled() && cred->security) {
- if ((unsigned long) cred->security < PAGE_SIZE)
- return true;
- if ((*(u32 *)cred->security & 0xffffff00) ==
- (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
- return true;
- }
-#endif
return false;
}
EXPORT_SYMBOL(creds_are_invalid);
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
index a85edc339985..332ee6c6ec2c 100644
--- a/kernel/debug/Makefile
+++ b/kernel/debug/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the linux kernel debugger
#
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 7510dc687c0d..4b280fc7dd67 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
return DBG_PASS_EVENT;
}
#endif
+ /* Fall through */
case 'C': /* Exception passing */
tmp = gdb_cmd_exception_pass(ks);
if (tmp > 0)
goto default_handle;
if (tmp == 0)
break;
- /* Fall through on tmp < 0 */
+ /* Fall through - on tmp < 0 */
case 'c': /* Continue packet */
case 's': /* Single step packet */
if (kgdb_contthread && kgdb_contthread != current) {
@@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
break;
}
dbg_activate_sw_breakpoints();
- /* Fall through to default processing */
+ /* Fall through - to default processing */
default:
default_handle:
error = kgdb_arch_handle_exception(ks->ex_vector,
@@ -1094,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
return error;
case 's':
case 'c':
- strcpy(remcom_in_buffer, cmd);
+ strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
return 0;
case '$':
- strcpy(remcom_in_buffer, cmd);
+ strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
gdbstub_prev_in_buf_pos = 0;
return 0;
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
index d4fc58f4b88d..efac857c5511 100644
--- a/kernel/debug/kdb/Makefile
+++ b/kernel/debug/kdb/Makefile
@@ -6,7 +6,6 @@
# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
#
-CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 6a4b41484afe..3a5184eb6977 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -446,7 +446,7 @@ poll_again:
char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
{
if (prompt && kdb_prompt_str != prompt)
- strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
+ strscpy(kdb_prompt_str, prompt, CMD_BUFLEN);
kdb_printf(kdb_prompt_str);
kdb_nextline = 1; /* Prompt and input resets line number */
return kdb_read(buffer, bufsize);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 82a3b32a7cfc..9ecfa37c7fbf 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv)
kdb_printf("machine %s\n", init_uts_ns.name.machine);
kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
- kdb_printf("ccversion %s\n", __stringify(CCVERSION));
now = __ktime_get_real_seconds();
time64_to_tm(now, 0, &tm);
@@ -2584,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv)
diag = kdbgetularg(argv[3], &whichcpu);
if (diag)
return diag;
- if (!cpu_online(whichcpu)) {
+ if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) {
kdb_printf("cpu %ld is not online\n", whichcpu);
return KDB_BADCPUNUM;
}
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 50bf9b119bad..b8e6306e7e13 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len)
while ((name = kdb_walk_kallsyms(&pos))) {
if (strncmp(name, prefix_name, prefix_len) == 0) {
- strcpy(ks_namebuf, name);
+ strscpy(ks_namebuf, name, sizeof(ks_namebuf));
/* Work out the longest name that matches the prefix */
if (++number == 1) {
prev_len = min_t(int, max_len-1,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 2a12b988c717..27725754ac99 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* delayacct.c - per-task delay accounting
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#include <linux/sched.h>
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index ca88b867e7fe..70f8f8d9200e 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config HAS_DMA
bool
@@ -16,7 +17,16 @@ config ARCH_DMA_ADDR_T_64BIT
config ARCH_HAS_DMA_COHERENCE_H
bool
-config HAVE_GENERIC_DMA_COHERENT
+config ARCH_HAS_DMA_SET_MASK
+ bool
+
+config DMA_DECLARE_COHERENT
+ bool
+
+config ARCH_HAS_SETUP_DMA_OPS
+ bool
+
+config ARCH_HAS_TEARDOWN_DMA_OPS
bool
config ARCH_HAS_SYNC_DMA_FOR_DEVICE
@@ -29,6 +39,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU
config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
bool
+config ARCH_HAS_DMA_PREP_COHERENT
+ bool
+
config ARCH_HAS_DMA_COHERENT_TO_PFN
bool
@@ -48,8 +61,122 @@ config SWIOTLB
config DMA_REMAP
depends on MMU
+ select GENERIC_ALLOCATOR
bool
config DMA_DIRECT_REMAP
bool
select DMA_REMAP
+
+config DMA_CMA
+ bool "DMA Contiguous Memory Allocator"
+ depends on HAVE_DMA_CONTIGUOUS && CMA
+ help
+ This enables the Contiguous Memory Allocator which allows drivers
+ to allocate big physically-contiguous blocks of memory for use with
+ hardware components that do not support I/O map nor scatter-gather.
+
+ You can disable CMA by specifying "cma=0" on the kernel's command
+ line.
+
+ For more information see <include/linux/dma-contiguous.h>.
+ If unsure, say "n".
+
+if DMA_CMA
+comment "Default contiguous memory area size:"
+
+config CMA_SIZE_MBYTES
+ int "Size in Mega Bytes"
+ depends on !CMA_SIZE_SEL_PERCENTAGE
+ default 0 if X86
+ default 16
+ help
+ Defines the size (in MiB) of the default memory area for Contiguous
+ Memory Allocator. If the size of 0 is selected, CMA is disabled by
+ default, but it can be enabled by passing cma=size[MG] to the kernel.
+
+
+config CMA_SIZE_PERCENTAGE
+ int "Percentage of total memory"
+ depends on !CMA_SIZE_SEL_MBYTES
+ default 0 if X86
+ default 10
+ help
+ Defines the size of the default memory area for Contiguous Memory
+ Allocator as a percentage of the total memory in the system.
+ If 0 percent is selected, CMA is disabled by default, but it can be
+ enabled by passing cma=size[MG] to the kernel.
+
+choice
+ prompt "Selected region size"
+ default CMA_SIZE_SEL_MBYTES
+
+config CMA_SIZE_SEL_MBYTES
+ bool "Use mega bytes value only"
+
+config CMA_SIZE_SEL_PERCENTAGE
+ bool "Use percentage value only"
+
+config CMA_SIZE_SEL_MIN
+ bool "Use lower value (minimum)"
+
+config CMA_SIZE_SEL_MAX
+ bool "Use higher value (maximum)"
+
+endchoice
+
+config CMA_ALIGNMENT
+ int "Maximum PAGE_SIZE order of alignment for contiguous buffers"
+ range 4 12
+ default 8
+ help
+ DMA mapping framework by default aligns all buffers to the smallest
+ PAGE_SIZE order which is greater than or equal to the requested buffer
+ size. This works well for buffers up to a few hundreds kilobytes, but
+ for larger buffers it just a memory waste. With this parameter you can
+ specify the maximum PAGE_SIZE order for contiguous buffers. Larger
+ buffers will be aligned only to this specified order. The order is
+ expressed as a power of two multiplied by the PAGE_SIZE.
+
+ For example, if your system defaults to 4KiB pages, the order value
+ of 8 means that the buffers will be aligned up to 1MiB only.
+
+ If unsure, leave the default value "8".
+
+endif
+
+config DMA_API_DEBUG
+ bool "Enable debugging of DMA-API usage"
+ select NEED_DMA_MAP_STATE
+ help
+ Enable this option to debug the use of the DMA API by device drivers.
+ With this option you will be able to detect common bugs in device
+ drivers like double-freeing of DMA mappings or freeing mappings that
+ were never allocated.
+
+ This also attempts to catch cases where a page owned by DMA is
+ accessed by the cpu in a way that could cause data corruption. For
+ example, this enables cow_user_page() to check that the source page is
+ not undergoing DMA.
+
+ This option causes a performance degradation. Use only if you want to
+ debug device drivers and dma interactions.
+
+ If unsure, say N.
+
+config DMA_API_DEBUG_SG
+ bool "Debug DMA scatter-gather usage"
+ default y
+ depends on DMA_API_DEBUG
+ help
+ Perform extra checking that callers of dma_map_sg() have respected the
+ appropriate segment length/boundary limits for the given device when
+ preparing DMA scatterlists.
+
+ This is particularly likely to have been overlooked in cases where the
+ dma_map_sg() API is used for general bulk mapping of pages rather than
+ preparing literal scatter-gather descriptors, where there is a risk of
+ unexpected behaviour from DMA API implementations if the scatterlist
+ is technically out-of-spec.
+
+ If unsure, say N.
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 72ff6e46aa86..d237cf3dc181 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -2,7 +2,7 @@
obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o
obj-$(CONFIG_DMA_CMA) += contiguous.o
-obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o
+obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 66f0fb7e9a3a..29fd6590dc1e 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -14,7 +14,6 @@ struct dma_coherent_mem {
dma_addr_t device_base;
unsigned long pfn_base;
int size;
- int flags;
unsigned long *bitmap;
spinlock_t spinlock;
bool use_dev_dma_pfn_offset;
@@ -38,12 +37,12 @@ static inline dma_addr_t dma_get_device_base(struct device *dev,
return mem->device_base;
}
-static int dma_init_coherent_memory(
- phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags,
- struct dma_coherent_mem **mem)
+static int dma_init_coherent_memory(phys_addr_t phys_addr,
+ dma_addr_t device_addr, size_t size,
+ struct dma_coherent_mem **mem)
{
struct dma_coherent_mem *dma_mem = NULL;
- void __iomem *mem_base = NULL;
+ void *mem_base = NULL;
int pages = size >> PAGE_SHIFT;
int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
int ret;
@@ -73,7 +72,6 @@ static int dma_init_coherent_memory(
dma_mem->device_base = device_addr;
dma_mem->pfn_base = PFN_DOWN(phys_addr);
dma_mem->size = pages;
- dma_mem->flags = flags;
spin_lock_init(&dma_mem->spinlock);
*mem = dma_mem;
@@ -110,12 +108,12 @@ static int dma_assign_coherent_memory(struct device *dev,
}
int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
- dma_addr_t device_addr, size_t size, int flags)
+ dma_addr_t device_addr, size_t size)
{
struct dma_coherent_mem *mem;
int ret;
- ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem);
+ ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem);
if (ret)
return ret;
@@ -137,29 +135,6 @@ void dma_release_declared_memory(struct device *dev)
}
EXPORT_SYMBOL(dma_release_declared_memory);
-void *dma_mark_declared_memory_occupied(struct device *dev,
- dma_addr_t device_addr, size_t size)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
- unsigned long flags;
- int pos, err;
-
- size += device_addr & ~PAGE_MASK;
-
- if (!mem)
- return ERR_PTR(-EINVAL);
-
- spin_lock_irqsave(&mem->spinlock, flags);
- pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem));
- err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
- spin_unlock_irqrestore(&mem->spinlock, flags);
-
- if (err != 0)
- return ERR_PTR(err);
- return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
ssize_t size, dma_addr_t *dma_handle)
{
@@ -213,15 +188,7 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
return 0;
*ret = __dma_alloc_from_coherent(mem, size, dma_handle);
- if (*ret)
- return 1;
-
- /*
- * In the case where the allocation can not be satisfied from the
- * per-device area, try to fall back to generic memory if the
- * constraints allow it.
- */
- return mem->flags & DMA_MEMORY_EXCLUSIVE;
+ return 1;
}
void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
@@ -350,8 +317,7 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
if (!mem) {
ret = dma_init_coherent_memory(rmem->base, rmem->base,
- rmem->size,
- DMA_MEMORY_EXCLUSIVE, &mem);
+ rmem->size, &mem);
if (ret) {
pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index b2a87905846d..bfc0c17f2a3d 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -214,6 +214,62 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
return cma_release(dev_get_cma_area(dev), pages, count);
}
+/**
+ * dma_alloc_contiguous() - allocate contiguous pages
+ * @dev: Pointer to device for which the allocation is performed.
+ * @size: Requested allocation size.
+ * @gfp: Allocation flags.
+ *
+ * This function allocates contiguous memory buffer for specified device. It
+ * first tries to use device specific contiguous memory area if available or
+ * the default global one, then tries a fallback allocation of normal pages.
+ *
+ * Note that it byapss one-page size of allocations from the global area as
+ * the addresses within one page are always contiguous, so there is no need
+ * to waste CMA pages for that kind; it also helps reduce fragmentations.
+ */
+struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
+{
+ int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+ size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ size_t align = get_order(PAGE_ALIGN(size));
+ struct page *page = NULL;
+ struct cma *cma = NULL;
+
+ if (dev && dev->cma_area)
+ cma = dev->cma_area;
+ else if (count > 1)
+ cma = dma_contiguous_default_area;
+
+ /* CMA can be used only in the context which permits sleeping */
+ if (cma && gfpflags_allow_blocking(gfp)) {
+ align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
+ page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN);
+ }
+
+ /* Fallback allocation of normal pages */
+ if (!page)
+ page = alloc_pages_node(node, gfp, align);
+ return page;
+}
+
+/**
+ * dma_free_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @page: Pointer to the allocated pages.
+ * @size: Size of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_contiguous(). As the
+ * cma_release returns false when provided pages do not belong to contiguous
+ * area and true otherwise, this function then does a fallback __free_pages()
+ * upon a false-return.
+ */
+void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
+{
+ if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT))
+ __free_pages(page, get_order(size));
+}
+
/*
* Support for reserved memory regions defined in device tree
*/
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 23cf5361bcf1..099002d84f46 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008 Advanced Micro Devices, Inc.
*
* Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) "DMA-API: " fmt
@@ -89,8 +77,8 @@ struct dma_debug_entry {
int sg_mapped_ents;
enum map_err_types map_err_type;
#ifdef CONFIG_STACKTRACE
- struct stack_trace stacktrace;
- unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
+ unsigned int stack_len;
+ unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];
#endif
};
@@ -134,17 +122,6 @@ static u32 nr_total_entries;
/* number of preallocated entries requested by kernel cmdline */
static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
-/* debugfs dentry's for the stuff above */
-static struct dentry *dma_debug_dent __read_mostly;
-static struct dentry *global_disable_dent __read_mostly;
-static struct dentry *error_count_dent __read_mostly;
-static struct dentry *show_all_errors_dent __read_mostly;
-static struct dentry *show_num_errors_dent __read_mostly;
-static struct dentry *num_free_entries_dent __read_mostly;
-static struct dentry *min_free_entries_dent __read_mostly;
-static struct dentry *nr_total_entries_dent __read_mostly;
-static struct dentry *filter_dent __read_mostly;
-
/* per-driver filter related state */
#define NAME_MAX_LEN 64
@@ -185,7 +162,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
#ifdef CONFIG_STACKTRACE
if (entry) {
pr_warning("Mapped at:\n");
- print_stack_trace(&entry->stacktrace, 0);
+ stack_trace_print(entry->stack_entries, entry->stack_len, 0);
}
#endif
}
@@ -715,12 +692,10 @@ static struct dma_debug_entry *dma_entry_alloc(void)
spin_unlock_irqrestore(&free_entries_lock, flags);
#ifdef CONFIG_STACKTRACE
- entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
- entry->stacktrace.entries = entry->st_entries;
- entry->stacktrace.skip = 2;
- save_stack_trace(&entry->stacktrace);
+ entry->stack_len = stack_trace_save(entry->stack_entries,
+ ARRAY_SIZE(entry->stack_entries),
+ 1);
#endif
-
return entry;
}
@@ -840,66 +815,46 @@ static const struct file_operations filter_fops = {
.llseek = default_llseek,
};
-static int dma_debug_fs_init(void)
+static int dump_show(struct seq_file *seq, void *v)
{
- dma_debug_dent = debugfs_create_dir("dma-api", NULL);
- if (!dma_debug_dent) {
- pr_err("can not create debugfs directory\n");
- return -ENOMEM;
- }
+ int idx;
- global_disable_dent = debugfs_create_bool("disabled", 0444,
- dma_debug_dent,
- &global_disable);
- if (!global_disable_dent)
- goto out_err;
-
- error_count_dent = debugfs_create_u32("error_count", 0444,
- dma_debug_dent, &error_count);
- if (!error_count_dent)
- goto out_err;
-
- show_all_errors_dent = debugfs_create_u32("all_errors", 0644,
- dma_debug_dent,
- &show_all_errors);
- if (!show_all_errors_dent)
- goto out_err;
-
- show_num_errors_dent = debugfs_create_u32("num_errors", 0644,
- dma_debug_dent,
- &show_num_errors);
- if (!show_num_errors_dent)
- goto out_err;
-
- num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444,
- dma_debug_dent,
- &num_free_entries);
- if (!num_free_entries_dent)
- goto out_err;
-
- min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444,
- dma_debug_dent,
- &min_free_entries);
- if (!min_free_entries_dent)
- goto out_err;
-
- nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444,
- dma_debug_dent,
- &nr_total_entries);
- if (!nr_total_entries_dent)
- goto out_err;
-
- filter_dent = debugfs_create_file("driver_filter", 0644,
- dma_debug_dent, NULL, &filter_fops);
- if (!filter_dent)
- goto out_err;
+ for (idx = 0; idx < HASH_SIZE; idx++) {
+ struct hash_bucket *bucket = &dma_entry_hash[idx];
+ struct dma_debug_entry *entry;
+ unsigned long flags;
+ spin_lock_irqsave(&bucket->lock, flags);
+ list_for_each_entry(entry, &bucket->list, list) {
+ seq_printf(seq,
+ "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n",
+ dev_name(entry->dev),
+ dev_driver_string(entry->dev),
+ type2name[entry->type], idx,
+ phys_addr(entry), entry->pfn,
+ entry->dev_addr, entry->size,
+ dir2name[entry->direction],
+ maperr2str[entry->map_err_type]);
+ }
+ spin_unlock_irqrestore(&bucket->lock, flags);
+ }
return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dump);
-out_err:
- debugfs_remove_recursive(dma_debug_dent);
-
- return -ENOMEM;
+static void dma_debug_fs_init(void)
+{
+ struct dentry *dentry = debugfs_create_dir("dma-api", NULL);
+
+ debugfs_create_bool("disabled", 0444, dentry, &global_disable);
+ debugfs_create_u32("error_count", 0444, dentry, &error_count);
+ debugfs_create_u32("all_errors", 0644, dentry, &show_all_errors);
+ debugfs_create_u32("num_errors", 0644, dentry, &show_num_errors);
+ debugfs_create_u32("num_free_entries", 0444, dentry, &num_free_entries);
+ debugfs_create_u32("min_free_entries", 0444, dentry, &min_free_entries);
+ debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries);
+ debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops);
+ debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops);
}
static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry)
@@ -985,12 +940,7 @@ static int dma_debug_init(void)
spin_lock_init(&dma_entry_hash[i].lock);
}
- if (dma_debug_fs_init() != 0) {
- pr_err("error creating debugfs entries - disabling\n");
- global_disable = true;
-
- return 0;
- }
+ dma_debug_fs_init();
nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES);
for (i = 0; i < nr_pages; ++i)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 355d16acee6d..b90e1aede743 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -96,8 +96,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
- unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- int page_order = get_order(size);
struct page *page = NULL;
u64 phys_mask;
@@ -109,20 +107,9 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_mask);
again:
- /* CMA can be used only in the context which permits sleeping */
- if (gfpflags_allow_blocking(gfp)) {
- page = dma_alloc_from_contiguous(dev, count, page_order,
- gfp & __GFP_NOWARN);
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- dma_release_from_contiguous(dev, page, count);
- page = NULL;
- }
- }
- if (!page)
- page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
-
+ page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- __free_pages(page, page_order);
+ dma_free_contiguous(dev, page, size);
page = NULL;
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
@@ -132,8 +119,7 @@ again:
goto again;
}
- if (IS_ENABLED(CONFIG_ZONE_DMA) &&
- phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
@@ -152,10 +138,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
if (!page)
return NULL;
+ if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+ /* remove any dirty cache lines on the kernel alias */
+ if (!PageHighMem(page))
+ arch_dma_prep_coherent(page, size);
+ /* return the page pointer as the opaque cookie */
+ return page;
+ }
+
if (PageHighMem(page)) {
/*
* Depending on the cma= arguments and per-arch setup
- * dma_alloc_from_contiguous could return highmem pages.
+ * dma_alloc_contiguous could return highmem pages.
* Without remapping there is no way to return them here,
* so log an error and fail.
*/
@@ -172,15 +166,19 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
*dma_handle = phys_to_dma(dev, page_to_phys(page));
}
memset(ret, 0, size);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ dma_alloc_need_uncached(dev, attrs)) {
+ arch_dma_prep_coherent(page, size);
+ ret = uncached_kernel_address(ret);
+ }
+
return ret;
}
void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
{
- unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- if (!dma_release_from_contiguous(dev, page, count))
- __free_pages(page, get_order(size));
+ dma_free_contiguous(dev, page, size);
}
void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
@@ -188,15 +186,26 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
{
unsigned int page_order = get_order(size);
+ if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
+ /* cpu_addr is a struct page cookie, not a kernel address */
+ __dma_direct_free_pages(dev, size, cpu_addr);
+ return;
+ }
+
if (force_dma_unencrypted())
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ dma_alloc_need_uncached(dev, attrs))
+ cpu_addr = cached_kernel_address(cpu_addr);
__dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
}
void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
- if (!dev_is_dma_coherent(dev))
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ dma_alloc_need_uncached(dev, attrs))
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
}
@@ -204,7 +213,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
void dma_direct_free(struct device *dev, size_t size,
void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
- if (!dev_is_dma_coherent(dev))
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ dma_alloc_need_uncached(dev, attrs))
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
else
dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
@@ -312,7 +322,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
size_t size)
{
return swiotlb_force != SWIOTLB_FORCE &&
- (!dev || dma_capable(dev, dma_addr, size));
+ dma_capable(dev, dma_addr, size);
}
dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
@@ -356,6 +366,20 @@ out_unmap:
}
EXPORT_SYMBOL(dma_direct_map_sg);
+dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ dma_addr_t dma_addr = paddr;
+
+ if (unlikely(!dma_direct_possible(dev, dma_addr, size))) {
+ report_addr(dev, dma_addr, size);
+ return DMA_MAPPING_ERROR;
+ }
+
+ return dma_addr;
+}
+EXPORT_SYMBOL(dma_direct_map_resource);
+
/*
* Because 32-bit DMA masks are so common we expect every architecture to be
* able to satisfy them - either by not supporting more physical memory, or by
@@ -380,3 +404,14 @@ int dma_direct_supported(struct device *dev, u64 mask)
*/
return mask >= __phys_to_dma(dev, min_mask);
}
+
+size_t dma_direct_max_mapping_size(struct device *dev)
+{
+ size_t size = SIZE_MAX;
+
+ /* If SWIOTLB is active, use its maximum mapping size */
+ if (is_swiotlb_active())
+ size = swiotlb_max_mapping_size(dev);
+
+ return size;
+}
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index a11006b6d8e8..1f628e7ac709 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -207,7 +207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
}
EXPORT_SYMBOL(dma_mmap_attrs);
-#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK
static u64 dma_default_get_required_mask(struct device *dev)
{
u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
@@ -238,11 +237,6 @@ u64 dma_get_required_mask(struct device *dev)
return dma_default_get_required_mask(dev);
}
EXPORT_SYMBOL_GPL(dma_get_required_mask);
-#endif
-
-#ifndef arch_dma_alloc_attrs
-#define arch_dma_alloc_attrs(dev) (true)
-#endif
void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t flag, unsigned long attrs)
@@ -250,7 +244,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
const struct dma_map_ops *ops = get_dma_ops(dev);
void *cpu_addr;
- WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
+ WARN_ON_ONCE(!dev->coherent_dma_mask);
if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
return cpu_addr;
@@ -258,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (!arch_dma_alloc_attrs(&dev))
- return NULL;
-
if (dma_is_direct(ops))
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
else if (ops->alloc)
@@ -318,22 +309,39 @@ int dma_supported(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_supported);
-#ifndef HAVE_ARCH_DMA_SET_MASK
+#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
+void arch_dma_set_mask(struct device *dev, u64 mask);
+#else
+#define arch_dma_set_mask(dev, mask) do { } while (0)
+#endif
+
int dma_set_mask(struct device *dev, u64 mask)
{
+ /*
+ * Truncate the mask to the actually supported dma_addr_t width to
+ * avoid generating unsupportable addresses.
+ */
+ mask = (dma_addr_t)mask;
+
if (!dev->dma_mask || !dma_supported(dev, mask))
return -EIO;
+ arch_dma_set_mask(dev, mask);
dma_check_mask(dev, mask);
*dev->dma_mask = mask;
return 0;
}
EXPORT_SYMBOL(dma_set_mask);
-#endif
#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
+ /*
+ * Truncate the mask to the actually supported dma_addr_t width to
+ * avoid generating unsupportable addresses.
+ */
+ mask = (dma_addr_t)mask;
+
if (!dma_supported(dev, mask))
return -EIO;
@@ -357,3 +365,17 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
ops->cache_sync(dev, vaddr, size, dir);
}
EXPORT_SYMBOL(dma_cache_sync);
+
+size_t dma_max_mapping_size(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+ size_t size = SIZE_MAX;
+
+ if (dma_is_direct(ops))
+ size = dma_direct_max_mapping_size(dev);
+ else if (ops && ops->max_mapping_size)
+ size = ops->max_mapping_size(dev);
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(dma_max_mapping_size);
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 7a723194ecbe..a594aec07882 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -158,6 +158,9 @@ out:
bool dma_in_atomic_pool(void *start, size_t size)
{
+ if (unlikely(!atomic_pool))
+ return false;
+
return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
}
@@ -199,8 +202,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
size = PAGE_ALIGN(size);
- if (!gfpflags_allow_blocking(flags) &&
- !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) {
+ if (!gfpflags_allow_blocking(flags)) {
ret = dma_alloc_from_pool(size, &page, flags);
if (!ret)
return NULL;
@@ -214,11 +216,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
- if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
- ret = page; /* opaque cookie */
- goto done;
- }
-
/* create a coherent mapping */
ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
@@ -237,10 +234,7 @@ done:
void arch_dma_free(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs)
{
- if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
- /* vaddr is a struct page cookie, not a kernel address */
- __dma_direct_free_pages(dev, size, vaddr);
- } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
+ if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
phys_addr_t phys = dma_to_phys(dev, dma_handle);
struct page *page = pfn_to_page(__phys_to_pfn(phys));
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1fb6fd68b9c7..62fa5a82a065 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Dynamic DMA mapping support.
*
@@ -34,6 +35,9 @@
#include <linux/scatterlist.h>
#include <linux/mem_encrypt.h>
#include <linux/set_memory.h>
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#endif
#include <asm/io.h>
#include <asm/dma.h>
@@ -73,6 +77,11 @@ phys_addr_t io_tlb_start, io_tlb_end;
static unsigned long io_tlb_nslabs;
/*
+ * The number of used IO TLB block
+ */
+static unsigned long io_tlb_used;
+
+/*
* This is a free list describing the number of free entries available from
* each index
*/
@@ -191,6 +200,7 @@ void __init swiotlb_update_mem_attributes(void)
int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
{
unsigned long i, bytes;
+ size_t alloc_size;
bytes = nslabs << IO_TLB_SHIFT;
@@ -203,12 +213,18 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
* to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
* between io_tlb_start and io_tlb_end.
*/
- io_tlb_list = memblock_alloc(
- PAGE_ALIGN(io_tlb_nslabs * sizeof(int)),
- PAGE_SIZE);
- io_tlb_orig_addr = memblock_alloc(
- PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)),
- PAGE_SIZE);
+ alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int));
+ io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!io_tlb_list)
+ panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
+ alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t));
+ io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE);
+ if (!io_tlb_orig_addr)
+ panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
+ __func__, alloc_size, PAGE_SIZE);
+
for (i = 0; i < io_tlb_nslabs; i++) {
io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
@@ -241,7 +257,7 @@ swiotlb_init(int verbose)
bytes = io_tlb_nslabs << IO_TLB_SHIFT;
/* Get IO TLB memory from the low pages */
- vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE);
+ vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE);
if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
return;
@@ -385,7 +401,7 @@ void __init swiotlb_exit(void)
}
/*
- * Bounce: copy the swiotlb buffer back to the original dma location
+ * Bounce: copy the swiotlb buffer from or back to the original dma location
*/
static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
@@ -437,6 +453,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
unsigned long mask;
unsigned long offset_slots;
unsigned long max_slots;
+ unsigned long tmp_io_tlb_used;
if (no_iotlb_memory)
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
@@ -475,6 +492,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
* request and allocate a buffer from that IO TLB pool.
*/
spin_lock_irqsave(&io_tlb_lock, flags);
+
+ if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
+ goto not_found;
+
index = ALIGN(io_tlb_index, stride);
if (index >= io_tlb_nslabs)
index = 0;
@@ -519,11 +540,15 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
} while (index != wrap);
not_found:
+ tmp_io_tlb_used = io_tlb_used;
+
spin_unlock_irqrestore(&io_tlb_lock, flags);
if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
- dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size);
+ dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
+ size, io_tlb_nslabs, tmp_io_tlb_used);
return DMA_MAPPING_ERROR;
found:
+ io_tlb_used += nslots;
spin_unlock_irqrestore(&io_tlb_lock, flags);
/*
@@ -584,6 +609,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
*/
for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
io_tlb_list[i] = ++count;
+
+ io_tlb_used -= nslots;
}
spin_unlock_irqrestore(&io_tlb_lock, flags);
}
@@ -651,14 +678,32 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
return true;
}
-/*
- * Return whether the given device DMA address mask can be supported
- * properly. For example, if your device can only drive the low 24-bits
- * during bus mastering, then you would pass 0x00ffffff as the mask to
- * this function.
- */
-int
-swiotlb_dma_supported(struct device *hwdev, u64 mask)
+size_t swiotlb_max_mapping_size(struct device *dev)
{
- return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
+ return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE;
}
+
+bool is_swiotlb_active(void)
+{
+ /*
+ * When SWIOTLB is initialized, even if io_tlb_start points to physical
+ * address zero, io_tlb_end surely doesn't.
+ */
+ return io_tlb_end != 0;
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static int __init swiotlb_create_debugfs(void)
+{
+ struct dentry *root;
+
+ root = debugfs_create_dir("swiotlb", NULL);
+ debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs);
+ debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used);
+ return 0;
+}
+
+late_initcall(swiotlb_create_debugfs);
+
+#endif
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 24a77c34e9ad..c2b41a263166 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events callchain code, extracted from core.c:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5ede6918050..eea9d52b010c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events core code:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/fs.h>
@@ -385,6 +384,8 @@ static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
+static atomic_t nr_ksymbol_events __read_mostly;
+static atomic_t nr_bpf_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -1171,7 +1172,7 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
static void get_ctx(struct perf_event_context *ctx)
{
- WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
+ refcount_inc(&ctx->refcount);
}
static void free_ctx(struct rcu_head *head)
@@ -1185,7 +1186,7 @@ static void free_ctx(struct rcu_head *head)
static void put_ctx(struct perf_event_context *ctx)
{
- if (atomic_dec_and_test(&ctx->refcount)) {
+ if (refcount_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task && ctx->task != TASK_TOMBSTONE)
@@ -1254,6 +1255,7 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
+ * perf_addr_filters_head::lock
*
* cpu_hotplug_lock
* pmus_lock
@@ -1267,7 +1269,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
again:
rcu_read_lock();
ctx = READ_ONCE(event->ctx);
- if (!atomic_inc_not_zero(&ctx->refcount)) {
+ if (!refcount_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
}
@@ -1400,7 +1402,7 @@ retry:
}
if (ctx->task == TASK_TOMBSTONE ||
- !atomic_inc_not_zero(&ctx->refcount)) {
+ !refcount_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock(&ctx->lock);
ctx = NULL;
} else {
@@ -2007,8 +2009,8 @@ event_sched_out(struct perf_event *event,
event->pmu->del(event, 0);
event->oncpu = -1;
- if (event->pending_disable) {
- event->pending_disable = 0;
+ if (READ_ONCE(event->pending_disable) >= 0) {
+ WRITE_ONCE(event->pending_disable, -1);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
@@ -2196,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
void perf_event_disable_inatomic(struct perf_event *event)
{
- event->pending_disable = 1;
+ WRITE_ONCE(event->pending_disable, smp_processor_id());
+ /* can fail, see perf_pending_event_disable() */
irq_work_queue(&event->pending);
}
@@ -2475,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
perf_pmu_enable(cpuctx->ctx.pmu);
}
+void perf_pmu_resched(struct pmu *pmu)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ perf_ctx_unlock(cpuctx, task_ctx);
+}
+
/*
* Cross CPU call to install and enable a performance event
*
@@ -2540,6 +2553,9 @@ unlock:
return ret;
}
+static bool exclusive_event_installable(struct perf_event *event,
+ struct perf_event_context *ctx);
+
/*
* Attach a performance event to a context.
*
@@ -2554,6 +2570,8 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
+ WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
+
if (event->cpu != -1)
event->cpu = cpu;
@@ -2797,7 +2815,7 @@ static int perf_event_stop(struct perf_event *event, int restart)
*
* (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
* we update the addresses of corresponding vmas in
- * event::addr_filters_offs array and bump the event::addr_filters_gen;
+ * event::addr_filter_ranges array and bump the event::addr_filters_gen;
* (p2) when an event is scheduled in (pmu::add), it calls
* perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
* if the generation has changed since the previous call.
@@ -2939,6 +2957,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
+ /*
+ * If we had been multiplexing, no rotations are necessary, now no events
+ * are active.
+ */
+ ctx->rotate_necessary = 0;
+
perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
@@ -3306,10 +3330,13 @@ static int flexible_sched_in(struct perf_event *event, void *data)
return 0;
if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
- else
+ int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
+ if (ret) {
sid->can_add_hw = 0;
+ sid->ctx->rotate_necessary = 1;
+ return 0;
+ }
+ list_add_tail(&event->active_list, &sid->ctx->flexible_active);
}
return 0;
@@ -3677,24 +3704,17 @@ ctx_first_active(struct perf_event_context *ctx)
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event *cpu_event = NULL, *task_event = NULL;
- bool cpu_rotate = false, task_rotate = false;
- struct perf_event_context *ctx = NULL;
+ struct perf_event_context *task_ctx = NULL;
+ int cpu_rotate, task_rotate;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- if (cpuctx->ctx.nr_events) {
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
- cpu_rotate = true;
- }
-
- ctx = cpuctx->task_ctx;
- if (ctx && ctx->nr_events) {
- if (ctx->nr_events != ctx->nr_active)
- task_rotate = true;
- }
+ cpu_rotate = cpuctx->ctx.rotate_necessary;
+ task_ctx = cpuctx->task_ctx;
+ task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
@@ -3703,7 +3723,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(ctx);
+ task_event = ctx_first_active(task_ctx);
if (cpu_rotate)
cpu_event = ctx_first_active(&cpuctx->ctx);
@@ -3711,17 +3731,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (ctx && cpu_event))
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_ctx && cpu_event))
+ ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
if (cpu_event)
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (task_event)
- rotate_ctx(ctx, task_event);
+ rotate_ctx(task_ctx, task_event);
if (cpu_event)
rotate_ctx(&cpuctx->ctx, cpu_event);
- perf_event_sched_in(cpuctx, ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4056,7 +4076,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
INIT_LIST_HEAD(&ctx->event_list);
INIT_LIST_HEAD(&ctx->pinned_active);
INIT_LIST_HEAD(&ctx->flexible_active);
- atomic_set(&ctx->refcount, 1);
+ refcount_set(&ctx->refcount, 1);
}
static struct perf_event_context *
@@ -4235,8 +4255,9 @@ static bool is_sb_event(struct perf_event *event)
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
attr->comm || attr->comm_exec ||
- attr->task ||
- attr->context_switch)
+ attr->task || attr->ksymbol ||
+ attr->context_switch ||
+ attr->bpf_event)
return true;
return false;
}
@@ -4305,6 +4326,10 @@ static void unaccount_event(struct perf_event *event)
dec = true;
if (has_branch_stack(event))
dec = true;
+ if (event->attr.ksymbol)
+ atomic_dec(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_dec(&nr_bpf_events);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -4340,7 +4365,7 @@ static int exclusive_event_init(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ if (!is_exclusive_pmu(pmu))
return 0;
/*
@@ -4371,7 +4396,7 @@ static void exclusive_event_destroy(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ if (!is_exclusive_pmu(pmu))
return;
/* see comment in exclusive_event_init() */
@@ -4391,14 +4416,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
return false;
}
-/* Called under the same ctx::mutex as perf_install_in_context() */
static bool exclusive_event_installable(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *iter_event;
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ lockdep_assert_held(&ctx->mutex);
+
+ if (!is_exclusive_pmu(pmu))
return true;
list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
@@ -4440,17 +4466,25 @@ static void _free_event(struct perf_event *event)
perf_event_free_bpf_prog(event);
perf_addr_filters_splice(event, NULL);
- kfree(event->addr_filters_offs);
+ kfree(event->addr_filter_ranges);
if (event->destroy)
event->destroy(event);
- if (event->ctx)
- put_ctx(event->ctx);
-
+ /*
+ * Must be after ->destroy(), due to uprobe_perf_close() using
+ * hw.target.
+ */
if (event->hw.target)
put_task_struct(event->hw.target);
+ /*
+ * perf_event_free_task() relies on put_ctx() being 'last', in particular
+ * all task references must be cleaned up.
+ */
+ if (event->ctx)
+ put_ctx(event->ctx);
+
exclusive_event_destroy(event);
module_put(event->pmu->module);
@@ -4630,8 +4664,17 @@ again:
mutex_unlock(&event->child_mutex);
list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+ void *var = &child->ctx->refcount;
+
list_del(&child->child_list);
free_event(child);
+
+ /*
+ * Wake any perf_event_free_task() waiting for this event to be
+ * freed.
+ */
+ smp_mb(); /* pairs with wait_var_event() */
+ wake_up_var(var);
}
no_ctx:
@@ -4963,6 +5006,11 @@ static void __perf_event_period(struct perf_event *event,
}
}
+static int perf_event_check_period(struct perf_event *event, u64 value)
+{
+ return event->pmu->check_period(event, value);
+}
+
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
u64 value;
@@ -4979,6 +5027,12 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL;
+ if (perf_event_check_period(event, value))
+ return -EINVAL;
+
+ if (!event->attr.freq && (value & (1ULL << 63)))
+ return -EINVAL;
+
event_function_call(event, __perf_event_period, &value);
return 0;
@@ -5388,7 +5442,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
- if (!atomic_inc_not_zero(&rb->refcount))
+ if (!refcount_inc_not_zero(&rb->refcount))
rb = NULL;
}
rcu_read_unlock();
@@ -5398,7 +5452,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event)
void ring_buffer_put(struct ring_buffer *rb)
{
- if (!atomic_dec_and_test(&rb->refcount))
+ if (!refcount_dec_and_test(&rb->refcount))
return;
WARN_ON_ONCE(!list_empty(&rb->event_list));
@@ -5459,11 +5513,11 @@ static void perf_mmap_close(struct vm_area_struct *vma)
/* now it's safe to free the pages */
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
/* this has to be the last one */
rb_free_aux(rb);
- WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+ WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
mutex_unlock(&event->mmap_mutex);
}
@@ -5532,7 +5586,7 @@ again:
*/
atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
- vma->vm_mm->pinned_vm -= mmap_locked;
+ atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
free_uid(mmap_user);
out_put:
@@ -5680,7 +5734,7 @@ accounting:
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
- locked = vma->vm_mm->pinned_vm + extra;
+ locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
!capable(CAP_IPC_LOCK)) {
@@ -5721,7 +5775,7 @@ accounting:
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
- vma->vm_mm->pinned_vm += extra;
+ atomic64_add(extra, &vma->vm_mm->pinned_vm);
atomic_inc(&event->mmap_count);
} else if (rb) {
@@ -5795,10 +5849,45 @@ void perf_event_wakeup(struct perf_event *event)
}
}
+static void perf_pending_event_disable(struct perf_event *event)
+{
+ int cpu = READ_ONCE(event->pending_disable);
+
+ if (cpu < 0)
+ return;
+
+ if (cpu == smp_processor_id()) {
+ WRITE_ONCE(event->pending_disable, -1);
+ perf_event_disable_local(event);
+ return;
+ }
+
+ /*
+ * CPU-A CPU-B
+ *
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-A;
+ * irq_work_queue();
+ *
+ * sched-out
+ * @pending_disable = -1;
+ *
+ * sched-in
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-B;
+ * irq_work_queue(); // FAILS
+ *
+ * irq_work_run()
+ * perf_pending_event()
+ *
+ * But the event runs on CPU-B and wants disabling there.
+ */
+ irq_work_queue_on(&event->pending, cpu);
+}
+
static void perf_pending_event(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry,
- struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending);
int rctx;
rctx = perf_swevent_get_recursion_context();
@@ -5807,10 +5896,7 @@ static void perf_pending_event(struct irq_work *entry)
* and we won't recurse 'further'.
*/
- if (event->pending_disable) {
- event->pending_disable = 0;
- perf_event_disable_local(event);
- }
+ perf_pending_event_disable(event);
if (event->pending_wakeup) {
event->pending_wakeup = 0;
@@ -5865,7 +5951,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (current->mm) {
+ } else if (!(current->flags & PF_KTHREAD)) {
perf_get_regs_user(regs_user, regs, regs_user_copy);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -6489,7 +6575,7 @@ void perf_prepare_sample(struct perf_event_header *header,
data->phys_addr = perf_virt_to_phys(data->addr);
}
-static __always_inline void
+static __always_inline int
__perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs,
@@ -6499,13 +6585,15 @@ __perf_event_output(struct perf_event *event,
{
struct perf_output_handle handle;
struct perf_event_header header;
+ int err;
/* protect the callchain buffers */
rcu_read_lock();
perf_prepare_sample(&header, data, event, regs);
- if (output_begin(&handle, event, header.size))
+ err = output_begin(&handle, event, header.size);
+ if (err)
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -6514,6 +6602,7 @@ __perf_event_output(struct perf_event *event,
exit:
rcu_read_unlock();
+ return err;
}
void
@@ -6532,12 +6621,12 @@ perf_event_output_backward(struct perf_event *event,
__perf_event_output(event, data, regs, perf_output_begin_backward);
}
-void
+int
perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- __perf_event_output(event, data, regs, perf_output_begin);
+ return __perf_event_output(event, data, regs, perf_output_begin);
}
/*
@@ -6678,7 +6767,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
if (filter->path.dentry) {
- event->addr_filters_offs[count] = 0;
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
restart++;
}
@@ -7170,6 +7260,7 @@ static void perf_event_mmap_output(struct perf_event *event,
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
+ u32 type = mmap_event->event_id.header.type;
int ret;
if (!perf_event_mmap_match(event, data))
@@ -7213,6 +7304,7 @@ static void perf_event_mmap_output(struct perf_event *event,
perf_output_end(&handle);
out:
mmap_event->event_id.header.size = size;
+ mmap_event->event_id.header.type = type;
}
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -7358,28 +7450,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
return true;
}
+static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
+ struct vm_area_struct *vma,
+ struct perf_addr_filter_range *fr)
+{
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ struct file *file = vma->vm_file;
+
+ if (!perf_addr_filter_match(filter, file, off, vma_size))
+ return false;
+
+ if (filter->offset < off) {
+ fr->start = vma->vm_start;
+ fr->size = min(vma_size, filter->size - (off - filter->offset));
+ } else {
+ fr->start = vma->vm_start + filter->offset - off;
+ fr->size = min(vma->vm_end - fr->start, filter->size);
+ }
+
+ return true;
+}
+
static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
{
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
struct vm_area_struct *vma = data;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
- struct file *file = vma->vm_file;
struct perf_addr_filter *filter;
unsigned int restart = 0, count = 0;
+ unsigned long flags;
if (!has_addr_filter(event))
return;
- if (!file)
+ if (!vma->vm_file)
return;
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (perf_addr_filter_match(filter, file, off,
- vma->vm_end - vma->vm_start)) {
- event->addr_filters_offs[count] = vma->vm_start;
+ if (perf_addr_filter_vma_adjust(filter, vma,
+ &event->addr_filter_ranges[count]))
restart++;
- }
count++;
}
@@ -7650,6 +7761,207 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+/*
+ * ksymbol register/unregister tracking
+ */
+
+struct perf_ksymbol_event {
+ const char *name;
+ int name_len;
+ struct {
+ struct perf_event_header header;
+ u64 addr;
+ u32 len;
+ u16 ksym_type;
+ u16 flags;
+ } event_id;
+};
+
+static int perf_event_ksymbol_match(struct perf_event *event)
+{
+ return event->attr.ksymbol;
+}
+
+static void perf_event_ksymbol_output(struct perf_event *event, void *data)
+{
+ struct perf_ksymbol_event *ksymbol_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_ksymbol_match(event))
+ return;
+
+ perf_event_header__init_id(&ksymbol_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ ksymbol_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, ksymbol_event->event_id);
+ __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
+ const char *sym)
+{
+ struct perf_ksymbol_event ksymbol_event;
+ char name[KSYM_NAME_LEN];
+ u16 flags = 0;
+ int name_len;
+
+ if (!atomic_read(&nr_ksymbol_events))
+ return;
+
+ if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
+ ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
+ goto err;
+
+ strlcpy(name, sym, KSYM_NAME_LEN);
+ name_len = strlen(name) + 1;
+ while (!IS_ALIGNED(name_len, sizeof(u64)))
+ name[name_len++] = '\0';
+ BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
+
+ if (unregister)
+ flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
+
+ ksymbol_event = (struct perf_ksymbol_event){
+ .name = name,
+ .name_len = name_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_KSYMBOL,
+ .size = sizeof(ksymbol_event.event_id) +
+ name_len,
+ },
+ .addr = addr,
+ .len = len,
+ .ksym_type = ksym_type,
+ .flags = flags,
+ },
+ };
+
+ perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
+ return;
+err:
+ WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
+}
+
+/*
+ * bpf program load/unload tracking
+ */
+
+struct perf_bpf_event {
+ struct bpf_prog *prog;
+ struct {
+ struct perf_event_header header;
+ u16 type;
+ u16 flags;
+ u32 id;
+ u8 tag[BPF_TAG_SIZE];
+ } event_id;
+};
+
+static int perf_event_bpf_match(struct perf_event *event)
+{
+ return event->attr.bpf_event;
+}
+
+static void perf_event_bpf_output(struct perf_event *event, void *data)
+{
+ struct perf_bpf_event *bpf_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_bpf_match(event))
+ return;
+
+ perf_event_header__init_id(&bpf_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ bpf_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, bpf_event->event_id);
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
+ enum perf_bpf_event_type type)
+{
+ bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
+ char sym[KSYM_NAME_LEN];
+ int i;
+
+ if (prog->aux->func_cnt == 0) {
+ bpf_get_prog_name(prog, sym);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)prog->bpf_func,
+ prog->jited_len, unregister, sym);
+ } else {
+ for (i = 0; i < prog->aux->func_cnt; i++) {
+ struct bpf_prog *subprog = prog->aux->func[i];
+
+ bpf_get_prog_name(subprog, sym);
+ perf_event_ksymbol(
+ PERF_RECORD_KSYMBOL_TYPE_BPF,
+ (u64)(unsigned long)subprog->bpf_func,
+ subprog->jited_len, unregister, sym);
+ }
+ }
+}
+
+void perf_event_bpf_event(struct bpf_prog *prog,
+ enum perf_bpf_event_type type,
+ u16 flags)
+{
+ struct perf_bpf_event bpf_event;
+
+ if (type <= PERF_BPF_EVENT_UNKNOWN ||
+ type >= PERF_BPF_EVENT_MAX)
+ return;
+
+ switch (type) {
+ case PERF_BPF_EVENT_PROG_LOAD:
+ case PERF_BPF_EVENT_PROG_UNLOAD:
+ if (atomic_read(&nr_ksymbol_events))
+ perf_event_bpf_emit_ksymbols(prog, type);
+ break;
+ default:
+ break;
+ }
+
+ if (!atomic_read(&nr_bpf_events))
+ return;
+
+ bpf_event = (struct perf_bpf_event){
+ .prog = prog,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_BPF_EVENT,
+ .size = sizeof(bpf_event.event_id),
+ },
+ .type = type,
+ .flags = flags,
+ .id = prog->aux->id,
+ },
+ };
+
+ BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
+
+ memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
+ perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
+}
+
void perf_event_itrace_started(struct perf_event *event)
{
event->attach_state |= PERF_ATTACH_ITRACE;
@@ -8248,9 +8560,9 @@ static int perf_tp_event_match(struct perf_event *event,
if (event->hw.state & PERF_HES_STOPPED)
return 0;
/*
- * All tracepoints are from kernel-space.
+ * If exclude_kernel, only trace user-space tracepoints (uprobes)
*/
- if (event->attr.exclude_kernel)
+ if (event->attr.exclude_kernel && !user_mode(regs))
return 0;
if (!perf_tp_filter_match(event, data))
@@ -8768,26 +9080,19 @@ static void perf_addr_filters_splice(struct perf_event *event,
* @filter; if so, adjust filter's address range.
* Called with mm::mmap_sem down for reading.
*/
-static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
- struct mm_struct *mm)
+static void perf_addr_filter_apply(struct perf_addr_filter *filter,
+ struct mm_struct *mm,
+ struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- struct file *file = vma->vm_file;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
-
- if (!file)
- continue;
-
- if (!perf_addr_filter_match(filter, file, off, vma_size))
+ if (!vma->vm_file)
continue;
- return vma->vm_start;
+ if (perf_addr_filter_vma_adjust(filter, vma, fr))
+ return;
}
-
- return 0;
}
/*
@@ -8810,26 +9115,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
if (task == TASK_TOMBSTONE)
return;
- if (!ifh->nr_file_filters)
- return;
-
- mm = get_task_mm(event->ctx->task);
- if (!mm)
- goto restart;
+ if (ifh->nr_file_filters) {
+ mm = get_task_mm(event->ctx->task);
+ if (!mm)
+ goto restart;
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
+ }
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- event->addr_filters_offs[count] = 0;
+ if (filter->path.dentry) {
+ /*
+ * Adjust base offset if the filter is associated to a
+ * binary that needs to be mapped:
+ */
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
- /*
- * Adjust base offset if the filter is associated to a binary
- * that needs to be mapped:
- */
- if (filter->path.dentry)
- event->addr_filters_offs[count] =
- perf_addr_filter_apply(filter, mm);
+ perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
+ } else {
+ event->addr_filter_ranges[count].start = filter->offset;
+ event->addr_filter_ranges[count].size = filter->size;
+ }
count++;
}
@@ -8837,9 +9145,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
event->addr_filters_gen++;
raw_spin_unlock_irqrestore(&ifh->lock, flags);
- up_read(&mm->mmap_sem);
+ if (ifh->nr_file_filters) {
+ up_read(&mm->mmap_sem);
- mmput(mm);
+ mmput(mm);
+ }
restart:
perf_event_stop(event, 1);
@@ -8943,6 +9253,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
case IF_SRC_KERNELADDR:
case IF_SRC_KERNEL:
kernel = 1;
+ /* fall through */
case IF_SRC_FILEADDR:
case IF_SRC_FILE:
@@ -9391,6 +9702,11 @@ static int perf_pmu_nop_int(struct pmu *pmu)
return 0;
}
+static int perf_event_nop_int(struct perf_event *event, u64 value)
+{
+ return 0;
+}
+
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
@@ -9586,6 +9902,12 @@ static int pmu_dev_alloc(struct pmu *pmu)
if (ret)
goto del_dev;
+ if (pmu->attr_update)
+ ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
+
+ if (ret)
+ goto del_dev;
+
out:
return ret;
@@ -9691,6 +10013,9 @@ got_cpu_context:
pmu->pmu_disable = perf_pmu_nop_void;
}
+ if (!pmu->check_period)
+ pmu->check_period = perf_event_nop_int;
+
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
@@ -9742,6 +10067,12 @@ void perf_pmu_unregister(struct pmu *pmu)
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+static inline bool has_extended_regs(struct perf_event *event)
+{
+ return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
+ (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
+}
+
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
struct perf_event_context *ctx = NULL;
@@ -9772,6 +10103,19 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
+ if (!ret) {
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event))
+ ret = -EOPNOTSUPP;
+
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event))
+ ret = -EINVAL;
+
+ if (ret && event->destroy)
+ event->destroy(event);
+ }
+
if (ret)
module_put(pmu->module);
@@ -9900,6 +10244,10 @@ static void account_event(struct perf_event *event)
inc = true;
if (is_cgroup_event(event))
inc = true;
+ if (event->attr.ksymbol)
+ atomic_inc(&nr_ksymbol_events);
+ if (event->attr.bpf_event)
+ atomic_inc(&nr_bpf_events);
if (inc) {
/*
@@ -9980,6 +10328,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
+ event->pending_disable = -1;
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
@@ -10082,14 +10431,28 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_pmu;
if (has_addr_filter(event)) {
- event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
- sizeof(unsigned long),
- GFP_KERNEL);
- if (!event->addr_filters_offs) {
+ event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
+ sizeof(struct perf_addr_filter_range),
+ GFP_KERNEL);
+ if (!event->addr_filter_ranges) {
err = -ENOMEM;
goto err_per_task;
}
+ /*
+ * Clone the parent's vma offsets: they are valid until exec()
+ * even if the mm is not shared with the parent.
+ */
+ if (event->parent) {
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+ raw_spin_lock_irq(&ifh->lock);
+ memcpy(event->addr_filter_ranges,
+ event->parent->addr_filter_ranges,
+ pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
+ raw_spin_unlock_irq(&ifh->lock);
+ }
+
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
}
@@ -10108,7 +10471,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
return event;
err_addr_filters:
- kfree(event->addr_filters_offs);
+ kfree(event->addr_filter_ranges);
err_per_task:
exclusive_event_destroy(event);
@@ -10361,11 +10724,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
break;
case CLOCK_BOOTTIME:
- event->clock = &ktime_get_boot_ns;
+ event->clock = &ktime_get_boottime_ns;
break;
case CLOCK_TAI:
- event->clock = &ktime_get_tai_ns;
+ event->clock = &ktime_get_clocktai_ns;
break;
default:
@@ -10391,7 +10754,7 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader,
again:
rcu_read_lock();
gctx = READ_ONCE(group_leader->ctx);
- if (!atomic_inc_not_zero(&gctx->refcount)) {
+ if (!refcount_inc_not_zero(&gctx->refcount)) {
rcu_read_unlock();
goto again;
}
@@ -10590,11 +10953,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_alloc;
}
- if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
- err = -EBUSY;
- goto err_context;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -10682,6 +11040,18 @@ SYSCALL_DEFINE5(perf_event_open,
move_group = 0;
}
}
+
+ /*
+ * Failure to create exclusive events returns -EBUSY.
+ */
+ err = -EBUSY;
+ if (!exclusive_event_installable(group_leader, ctx))
+ goto err_locked;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (!exclusive_event_installable(sibling, ctx))
+ goto err_locked;
+ }
} else {
mutex_lock(&ctx->mutex);
}
@@ -10718,9 +11088,6 @@ SYSCALL_DEFINE5(perf_event_open,
* because we need to serialize with concurrent event creation.
*/
if (!exclusive_event_installable(event, ctx)) {
- /* exclusive and group stuff are assumed mutually exclusive */
- WARN_ON_ONCE(move_group);
-
err = -EBUSY;
goto err_locked;
}
@@ -11187,11 +11554,11 @@ static void perf_free_event(struct perf_event *event,
}
/*
- * Free an unexposed, unused context as created by inheritance by
- * perf_event_init_task below, used by fork() in case of fail.
+ * Free a context as created by inheritance by perf_event_init_task() below,
+ * used by fork() in case of fail.
*
- * Not all locks are strictly required, but take them anyway to be nice and
- * help out with the lockdep assertions.
+ * Even though the task has never lived, the context and events have been
+ * exposed through the child_list, so we must take care tearing it all down.
*/
void perf_event_free_task(struct task_struct *task)
{
@@ -11221,7 +11588,23 @@ void perf_event_free_task(struct task_struct *task)
perf_free_event(event, ctx);
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
}
@@ -11608,7 +11991,7 @@ static void __init perf_event_init_all_cpus(void)
}
}
-void perf_swevent_init_cpu(unsigned int cpu)
+static void perf_swevent_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 5befb338a18d..c5cd852fe86b 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -1,18 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) 2007 Alan Stern
* Copyright (C) IBM Corporation, 2009
* Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 6dc725a7e7bc..3aef4191798c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -4,13 +4,14 @@
#include <linux/hardirq.h>
#include <linux/uaccess.h>
+#include <linux/refcount.h>
/* Buffer handling */
#define RING_BUFFER_WRITABLE 0x01
struct ring_buffer {
- atomic_t refcount;
+ refcount_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
@@ -23,7 +24,7 @@ struct ring_buffer {
atomic_t poll; /* POLL_ for wakeups */
local_t head; /* write position */
- local_t nest; /* nested writers */
+ unsigned int nest; /* nested writers */
local_t events; /* event limit */
local_t wakeup; /* wakeup stamp */
local_t lost; /* nr records lost */
@@ -40,7 +41,7 @@ struct ring_buffer {
/* AUX area */
long aux_head;
- local_t aux_nest;
+ unsigned int aux_nest;
long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
@@ -48,7 +49,7 @@ struct ring_buffer {
atomic_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
- atomic_t aux_refcount;
+ refcount_t aux_refcount;
void **aux_pages;
void *aux_priv;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 309ef5a64af5..ffb59a4ef4ff 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Performance events ring-buffer code:
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
*/
#include <linux/perf_event.h>
@@ -39,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle)
struct ring_buffer *rb = handle->rb;
preempt_disable();
- local_inc(&rb->nest);
+
+ /*
+ * Avoid an explicit LOAD/STORE such that architectures with memops
+ * can use them.
+ */
+ (*(volatile unsigned int *)&rb->nest)++;
handle->wakeup = local_read(&rb->wakeup);
}
@@ -47,17 +51,35 @@ static void perf_output_put_handle(struct perf_output_handle *handle)
{
struct ring_buffer *rb = handle->rb;
unsigned long head;
+ unsigned int nest;
+
+ /*
+ * If this isn't the outermost nesting, we don't have to update
+ * @rb->user_page->data_head.
+ */
+ nest = READ_ONCE(rb->nest);
+ if (nest > 1) {
+ WRITE_ONCE(rb->nest, nest - 1);
+ goto out;
+ }
again:
+ /*
+ * In order to avoid publishing a head value that goes backwards,
+ * we must ensure the load of @rb->head happens after we've
+ * incremented @rb->nest.
+ *
+ * Otherwise we can observe a @rb->head value before one published
+ * by an IRQ/NMI happening between the load and the increment.
+ */
+ barrier();
head = local_read(&rb->head);
/*
- * IRQ/NMI can happen here, which means we can miss a head update.
+ * IRQ/NMI can happen here and advance @rb->head, causing our
+ * load above to be stale.
*/
- if (!local_dec_and_test(&rb->nest))
- goto out;
-
/*
* Since the mmap() consumer (userspace) can run on a different CPU:
*
@@ -85,14 +107,23 @@ again:
* See perf_output_begin().
*/
smp_wmb(); /* B, matches C */
- rb->user_page->data_head = head;
+ WRITE_ONCE(rb->user_page->data_head, head);
/*
- * Now check if we missed an update -- rely on previous implied
- * compiler barriers to force a re-read.
+ * We must publish the head before decrementing the nest count,
+ * otherwise an IRQ/NMI can publish a more recent head value and our
+ * write will (temporarily) publish a stale value.
*/
+ barrier();
+ WRITE_ONCE(rb->nest, 0);
+
+ /*
+ * Ensure we decrement @rb->nest before we validate the @rb->head.
+ * Otherwise we cannot be sure we caught the 'last' nested update.
+ */
+ barrier();
if (unlikely(head != local_read(&rb->head))) {
- local_inc(&rb->nest);
+ WRITE_ONCE(rb->nest, 1);
goto again;
}
@@ -285,7 +316,7 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
else
rb->overwrite = 1;
- atomic_set(&rb->refcount, 1);
+ refcount_set(&rb->refcount, 1);
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
@@ -331,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *output_event = event;
unsigned long aux_head, aux_tail;
struct ring_buffer *rb;
+ unsigned int nest;
if (output_event->parent)
output_event = output_event->parent;
@@ -358,16 +390,19 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!atomic_read(&rb->aux_mmap_count))
goto err;
- if (!atomic_inc_not_zero(&rb->aux_refcount))
+ if (!refcount_inc_not_zero(&rb->aux_refcount))
goto err;
+ nest = READ_ONCE(rb->aux_nest);
/*
* Nesting is not supported for AUX area, make sure nested
* writers are caught early
*/
- if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
+ if (WARN_ON_ONCE(nest))
goto err_put;
+ WRITE_ONCE(rb->aux_nest, nest + 1);
+
aux_head = rb->aux_head;
handle->rb = rb;
@@ -393,9 +428,9 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
- event->pending_disable = 1;
+ event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
- local_set(&rb->aux_nest, 0);
+ WRITE_ONCE(rb->aux_nest, 0);
goto err_put;
}
}
@@ -456,38 +491,35 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
rb->aux_head += size;
}
- if (size || handle->aux_flags) {
- /*
- * Only send RECORD_AUX if we have something useful to communicate
- *
- * Note: the OVERWRITE records by themselves are not considered
- * useful, as they don't communicate any *new* information,
- * aside from the short-lived offset, that becomes history at
- * the next event sched-in and therefore isn't useful.
- * The userspace that needs to copy out AUX data in overwrite
- * mode should know to use user_page::aux_head for the actual
- * offset. So, from now on we don't output AUX records that
- * have *only* OVERWRITE flag set.
- */
-
- if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)
- perf_event_aux_event(handle->event, aux_head, size,
- handle->aux_flags);
- }
+ /*
+ * Only send RECORD_AUX if we have something useful to communicate
+ *
+ * Note: the OVERWRITE records by themselves are not considered
+ * useful, as they don't communicate any *new* information,
+ * aside from the short-lived offset, that becomes history at
+ * the next event sched-in and therefore isn't useful.
+ * The userspace that needs to copy out AUX data in overwrite
+ * mode should know to use user_page::aux_head for the actual
+ * offset. So, from now on we don't output AUX records that
+ * have *only* OVERWRITE flag set.
+ */
+ if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
+ perf_event_aux_event(handle->event, aux_head, size,
+ handle->aux_flags);
- rb->user_page->aux_head = rb->aux_head;
+ WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb))
wakeup = true;
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
- handle->event->pending_disable = 1;
+ handle->event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
}
handle->event = NULL;
- local_set(&rb->aux_nest, 0);
+ WRITE_ONCE(rb->aux_nest, 0);
/* can't be last */
rb_free_aux(rb);
ring_buffer_put(rb);
@@ -507,7 +539,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
rb->aux_head += size;
- rb->user_page->aux_head = rb->aux_head;
+ WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
@@ -599,29 +631,26 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
{
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
- int ret = -ENOMEM, max_order = 0;
+ int ret = -ENOMEM, max_order;
if (!has_aux(event))
return -EOPNOTSUPP;
- if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
- /*
- * We need to start with the max_order that fits in nr_pages,
- * not the other way around, hence ilog2() and not get_order.
- */
- max_order = ilog2(nr_pages);
+ /*
+ * We need to start with the max_order that fits in nr_pages,
+ * not the other way around, hence ilog2() and not get_order.
+ */
+ max_order = ilog2(nr_pages);
- /*
- * PMU requests more than one contiguous chunks of memory
- * for SW double buffering
- */
- if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
- !overwrite) {
- if (!max_order)
- return -EINVAL;
+ /*
+ * PMU requests more than one contiguous chunks of memory
+ * for SW double buffering
+ */
+ if (!overwrite) {
+ if (!max_order)
+ return -EINVAL;
- max_order--;
- }
+ max_order--;
}
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
@@ -658,7 +687,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
goto out;
}
- rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages,
+ rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
overwrite);
if (!rb->aux_priv)
goto out;
@@ -671,7 +700,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
* we keep a refcount here to make sure either of the two can
* reference them safely.
*/
- atomic_set(&rb->aux_refcount, 1);
+ refcount_set(&rb->aux_refcount, 1);
rb->aux_overwrite = overwrite;
rb->aux_watermark = watermark;
@@ -690,7 +719,7 @@ out:
void rb_free_aux(struct ring_buffer *rb)
{
- if (atomic_dec_and_test(&rb->aux_refcount))
+ if (refcount_dec_and_test(&rb->aux_refcount))
__rb_free_aux(rb);
}
@@ -734,7 +763,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct ring_buffer);
size += nr_pages * sizeof(void *);
- if (order_base_2(size) >= MAX_ORDER)
+ if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
goto fail;
rb = kzalloc(size, GFP_KERNEL);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8aef47ee7bfa..84fa00497c49 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* User-space Probes (UProbes)
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) IBM Corporation, 2008-2012
* Authors:
* Srikar Dronamraju
@@ -59,14 +46,14 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
-static struct percpu_rw_semaphore dup_mmap_sem;
+DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
- atomic_t ref;
+ refcount_t ref;
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
@@ -174,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct mmu_notifier_range range;
struct mem_cgroup *memcg;
- mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
@@ -560,13 +548,13 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
- atomic_inc(&uprobe->ref);
+ refcount_inc(&uprobe->ref);
return uprobe;
}
static void put_uprobe(struct uprobe *uprobe)
{
- if (atomic_dec_and_test(&uprobe->ref)) {
+ if (refcount_dec_and_test(&uprobe->ref)) {
/*
* If application munmap(exec_vma) before uprobe_unregister()
* gets called, we don't get a chance to remove uprobe from
@@ -657,7 +645,7 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
rb_link_node(&uprobe->rb_node, parent, p);
rb_insert_color(&uprobe->rb_node, &uprobes_tree);
/* get access + creation ref */
- atomic_set(&uprobe->ref, 2);
+ refcount_set(&uprobe->ref, 2);
return u;
}
@@ -2041,7 +2029,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
if (uc->handler) {
rc = uc->handler(uc, regs);
WARN(rc & ~UPROBE_HANDLER_MASK,
- "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ "bad rc=0x%x from %ps()\n", rc, uc->handler);
}
if (uc->ret_handler)
@@ -2124,7 +2112,7 @@ static void handle_trampoline(struct pt_regs *regs)
sigill:
uprobe_warn(current, "handle uretprobe, sending SIGILL.");
- force_sig(SIGILL, current);
+ force_sig(SIGILL);
}
@@ -2240,7 +2228,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
if (unlikely(err)) {
uprobe_warn(current, "execute the probed insn, sending SIGILL.");
- force_sig(SIGILL, current);
+ force_sig(SIGILL);
}
}
@@ -2307,16 +2295,12 @@ static struct notifier_block uprobe_exception_nb = {
.priority = INT_MAX-1, /* notified after kprobes, kgdb */
};
-static int __init init_uprobes(void)
+void __init uprobes_init(void)
{
int i;
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- if (percpu_init_rwsem(&dup_mmap_sem))
- return -ENOMEM;
-
- return register_die_notifier(&uprobe_exception_nb);
+ BUG_ON(register_die_notifier(&uprobe_exception_nb));
}
-__initcall(init_uprobes);
diff --git a/kernel/exit.c b/kernel/exit.c
index 2639a30a8aa5..a75b6a7f458a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/exit.c
*
@@ -194,6 +195,7 @@ repeat:
rcu_read_unlock();
proc_flush_task(p);
+ cgroup_release(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
@@ -421,7 +423,7 @@ retry:
* freed task structure.
*/
if (atomic_read(&mm->mm_users) <= 1) {
- mm->owner = NULL;
+ WRITE_ONCE(mm->owner, NULL);
return;
}
@@ -461,7 +463,7 @@ retry:
* most likely racing with swapoff (try_to_unuse()) or /proc or
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
*/
- mm->owner = NULL;
+ WRITE_ONCE(mm->owner, NULL);
return;
assign_new_owner:
@@ -482,7 +484,7 @@ assign_new_owner:
put_task_struct(c);
goto retry;
}
- mm->owner = c;
+ WRITE_ONCE(mm->owner, c);
task_unlock(c);
put_task_struct(c);
}
diff --git a/kernel/extable.c b/kernel/extable.c
index 6a5b61ebc66c..e23cce6e6092 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Rewritten by Rusty Russell, on the backs of many others...
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/ftrace.h>
#include <linux/memory.h>
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 17f75b545f66..63b349168da7 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -152,20 +152,13 @@ static int fei_retval_get(void *data, u64 *val)
DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
"%llx\n");
-static int fei_debugfs_add_attr(struct fei_attr *attr)
+static void fei_debugfs_add_attr(struct fei_attr *attr)
{
struct dentry *dir;
dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
- if (!dir)
- return -ENOMEM;
-
- if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
- debugfs_remove_recursive(dir);
- return -ENOMEM;
- }
- return 0;
+ debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops);
}
static void fei_debugfs_remove_attr(struct fei_attr *attr)
@@ -210,7 +203,7 @@ static int fei_seq_show(struct seq_file *m, void *v)
{
struct fei_attr *attr = list_entry(v, struct fei_attr, list);
- seq_printf(m, "%pf\n", attr->kp.addr);
+ seq_printf(m, "%ps\n", attr->kp.addr);
return 0;
}
@@ -306,7 +299,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
ret = register_kprobe(&attr->kp);
if (!ret)
- ret = fei_debugfs_add_attr(attr);
+ fei_debugfs_add_attr(attr);
if (ret < 0)
fei_attr_remove(attr);
else {
@@ -337,19 +330,13 @@ static int __init fei_debugfs_init(void)
return PTR_ERR(dir);
/* injectable attribute is just a symlink of error_inject/list */
- if (!debugfs_create_symlink("injectable", dir,
- "../error_injection/list"))
- goto error;
+ debugfs_create_symlink("injectable", dir, "../error_injection/list");
- if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops))
- goto error;
+ debugfs_create_file("inject", 0600, dir, NULL, &fei_ops);
fei_debugfs_dir = dir;
return 0;
-error:
- debugfs_remove_recursive(dir);
- return -ENOMEM;
}
late_initcall(fei_debugfs_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index b69248e6f0e0..8f3e2d97d771 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/fork.c
*
@@ -11,6 +12,7 @@
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
*/
+#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
@@ -21,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
@@ -77,7 +80,6 @@
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
-#include <linux/sched/mm.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
@@ -121,7 +123,7 @@
unsigned long total_forks; /* Handle normal Linux uptimes. */
int nr_threads; /* The idle threads do not count.. */
-int max_threads; /* tunable limit on nr_threads */
+static int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
@@ -246,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
- return page ? page_address(page) : NULL;
+ if (likely(page)) {
+ tsk->stack = page_address(page);
+ return tsk->stack;
+ }
+ return NULL;
#endif
}
@@ -429,7 +435,7 @@ static void release_task_stack(struct task_struct *tsk)
#ifdef CONFIG_THREAD_INFO_IN_TASK
void put_task_stack(struct task_struct *tsk)
{
- if (atomic_dec_and_test(&tsk->stack_refcount))
+ if (refcount_dec_and_test(&tsk->stack_refcount))
release_task_stack(tsk);
}
#endif
@@ -447,7 +453,7 @@ void free_task(struct task_struct *tsk)
* If the task had a separate stack allocation, it should be gone
* by now.
*/
- WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
@@ -710,14 +716,14 @@ static inline void free_signal_struct(struct signal_struct *sig)
static inline void put_signal_struct(struct signal_struct *sig)
{
- if (atomic_dec_and_test(&sig->sigcnt))
+ if (refcount_dec_and_test(&sig->sigcnt))
free_signal_struct(sig);
}
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
- WARN_ON(atomic_read(&tsk->usage));
+ WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
cgroup_free(tsk);
@@ -816,6 +822,7 @@ void __init fork_init(void)
#endif
lockdep_init_task(&init_task);
+ uprobes_init();
}
int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -867,7 +874,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->stack_vm_area = stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
- atomic_set(&tsk->stack_refcount, 1);
+ refcount_set(&tsk->stack_refcount, 1);
#endif
if (err)
@@ -891,12 +898,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
+ if (orig->cpus_ptr == &orig->cpus_mask)
+ tsk->cpus_ptr = &tsk->cpus_mask;
/*
* One for us, one for whoever does the "release_task()" (usually
* parent)
*/
- atomic_set(&tsk->usage, 2);
+ refcount_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0;
#endif
@@ -953,6 +962,15 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+static __always_inline void mm_clear_owner(struct mm_struct *mm,
+ struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ if (mm->owner == p)
+ WRITE_ONCE(mm->owner, NULL);
+#endif
+}
+
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
@@ -981,7 +999,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
- mm->pinned_vm = 0;
+ atomic64_set(&mm->pinned_vm, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
spin_lock_init(&mm->arg_lock);
@@ -1223,7 +1241,9 @@ static int wait_for_vfork_done(struct task_struct *child,
int killed;
freezer_do_not_count();
+ cgroup_enter_frozen();
killed = wait_for_completion_killable(vfork);
+ cgroup_leave_frozen(false);
freezer_count();
if (killed) {
@@ -1299,13 +1319,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
complete_vfork_done(tsk);
}
-/*
- * Allocate a new mm structure and copy contents from the
- * mm structure of the passed in task structure.
+/**
+ * dup_mm() - duplicates an existing mm structure
+ * @tsk: the task_struct with which the new mm will be associated.
+ * @oldmm: the mm to duplicate.
+ *
+ * Allocates a new mm structure and duplicates the provided @oldmm structure
+ * content into it.
+ *
+ * Return: the duplicated mm or NULL on failure.
*/
-static struct mm_struct *dup_mm(struct task_struct *tsk)
+static struct mm_struct *dup_mm(struct task_struct *tsk,
+ struct mm_struct *oldmm)
{
- struct mm_struct *mm, *oldmm = current->mm;
+ struct mm_struct *mm;
int err;
mm = allocate_mm();
@@ -1332,6 +1359,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
free_pt:
/* don't put binfmt in mmput, we haven't got module yet */
mm->binfmt = NULL;
+ mm_init_owner(mm, NULL);
mmput(mm);
fail_nomem:
@@ -1372,7 +1400,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
}
retval = -ENOMEM;
- mm = dup_mm(tsk);
+ mm = dup_mm(tsk, current->mm);
if (!mm)
goto fail_nomem;
@@ -1463,7 +1491,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
struct sighand_struct *sig;
if (clone_flags & CLONE_SIGHAND) {
- atomic_inc(&current->sighand->count);
+ refcount_inc(&current->sighand->count);
return 0;
}
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
@@ -1471,7 +1499,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
if (!sig)
return -ENOMEM;
- atomic_set(&sig->count, 1);
+ refcount_set(&sig->count, 1);
spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
spin_unlock_irq(&current->sighand->siglock);
@@ -1480,7 +1508,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
void __cleanup_sighand(struct sighand_struct *sighand)
{
- if (atomic_dec_and_test(&sighand->count)) {
+ if (refcount_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
/*
* sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
@@ -1527,7 +1555,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->nr_threads = 1;
atomic_set(&sig->live, 1);
- atomic_set(&sig->sigcnt, 1);
+ refcount_set(&sig->sigcnt, 1);
/* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
@@ -1663,6 +1691,74 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_RCU */
}
+static int pidfd_release(struct inode *inode, struct file *file)
+{
+ struct pid *pid = file->private_data;
+
+ file->private_data = NULL;
+ put_pid(pid);
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
+ struct pid *pid = f->private_data;
+
+ seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
+ seq_putc(m, '\n');
+}
+#endif
+
+/*
+ * Poll support for process exit notification.
+ */
+static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct task_struct *task;
+ struct pid *pid = file->private_data;
+ int poll_flags = 0;
+
+ poll_wait(file, &pid->wait_pidfd, pts);
+
+ rcu_read_lock();
+ task = pid_task(pid, PIDTYPE_PID);
+ /*
+ * Inform pollers only when the whole thread group exits.
+ * If the thread group leader exits before all other threads in the
+ * group, then poll(2) should block, similar to the wait(2) family.
+ */
+ if (!task || (task->exit_state && thread_group_empty(task)))
+ poll_flags = POLLIN | POLLRDNORM;
+ rcu_read_unlock();
+
+ return poll_flags;
+}
+
+const struct file_operations pidfd_fops = {
+ .release = pidfd_release,
+ .poll = pidfd_poll,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = pidfd_show_fdinfo,
+#endif
+};
+
+static void __delayed_free_task(struct rcu_head *rhp)
+{
+ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+ free_task(tsk);
+}
+
+static __always_inline void delayed_free_task(struct task_struct *tsk)
+{
+ if (IS_ENABLED(CONFIG_MEMCG))
+ call_rcu(&tsk->rcu, __delayed_free_task);
+ else
+ free_task(tsk);
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1672,18 +1768,16 @@ static inline void rcu_copy_process(struct task_struct *p)
* flags). The actual kick-off is left to the caller.
*/
static __latent_entropy struct task_struct *copy_process(
- unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *child_tidptr,
struct pid *pid,
int trace,
- unsigned long tls,
- int node)
+ int node,
+ struct kernel_clone_args *args)
{
- int retval;
+ int pidfd = -1, retval;
struct task_struct *p;
struct multiprocess_signals delayed;
+ struct file *pidfile = NULL;
+ u64 clone_flags = args->flags;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1731,6 +1825,16 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_PIDFD) {
+ /*
+ * - CLONE_DETACHED is blocked so that we can potentially
+ * reuse it later for CLONE_PIDFD.
+ * - CLONE_THREAD is blocked until someone really needs it.
+ */
+ if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -1760,11 +1864,11 @@ static __latent_entropy struct task_struct *copy_process(
* p->set_child_tid which is (ab)used as a kthread's data pointer for
* kernel threads (PF_KTHREAD).
*/
- p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
*/
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
ftrace_graph_init_task(p);
@@ -1869,9 +1973,6 @@ static __latent_entropy struct task_struct *copy_process(
p->pagefault_disabled = 0;
#ifdef CONFIG_LOCKDEP
- p->lockdep_depth = 0; /* no locks held yet */
- p->curr_chain_key = 0;
- p->lockdep_recursion = 0;
lockdep_init_task(p);
#endif
@@ -1923,7 +2024,8 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+ retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
+ args->tls);
if (retval)
goto bad_fork_cleanup_io;
@@ -1937,6 +2039,32 @@ static __latent_entropy struct task_struct *copy_process(
}
}
+ /*
+ * This has to happen after we've potentially unshared the file
+ * descriptor table (so that the pidfd doesn't leak into the child
+ * if the fd table isn't shared).
+ */
+ if (clone_flags & CLONE_PIDFD) {
+ retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (retval < 0)
+ goto bad_fork_free_pid;
+
+ pidfd = retval;
+
+ pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfile)) {
+ put_unused_fd(pidfd);
+ retval = PTR_ERR(pidfile);
+ goto bad_fork_free_pid;
+ }
+ get_pid(pid); /* held by pidfile now */
+
+ retval = put_user(pidfd, args->pidfd);
+ if (retval)
+ goto bad_fork_put_pidfd;
+ }
+
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
@@ -1963,7 +2091,7 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
- clear_all_latency_tracing(p);
+ clear_tsk_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
@@ -1975,7 +2103,7 @@ static __latent_entropy struct task_struct *copy_process(
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
- p->exit_signal = (clone_flags & CSIGNAL);
+ p->exit_signal = args->exit_signal;
p->group_leader = p;
p->tgid = p->pid;
}
@@ -1997,7 +2125,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
retval = cgroup_can_fork(p);
if (retval)
- goto bad_fork_free_pid;
+ goto bad_fork_cgroup_threadgroup_change_end;
/*
* From this point on we must avoid any synchronous user-space
@@ -2008,7 +2136,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
p->start_time = ktime_get_ns();
- p->real_start_time = ktime_get_boot_ns();
+ p->real_start_time = ktime_get_boottime_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
@@ -2049,6 +2177,9 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
+ /* past the last point of failure */
+ if (pidfile)
+ fd_install(pidfd, pidfile);
init_task_pid_links(p);
if (likely(p->pid)) {
@@ -2082,7 +2213,7 @@ static __latent_entropy struct task_struct *copy_process(
} else {
current->signal->nr_threads++;
atomic_inc(&current->signal->live);
- atomic_inc(&current->signal->sigcnt);
+ refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
list_add_tail_rcu(&p->thread_group,
&p->group_leader->thread_group);
@@ -2112,8 +2243,14 @@ bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p);
-bad_fork_free_pid:
+bad_fork_cgroup_threadgroup_change_end:
cgroup_threadgroup_change_end(current);
+bad_fork_put_pidfd:
+ if (clone_flags & CLONE_PIDFD) {
+ fput(pidfile);
+ put_unused_fd(pidfd);
+ }
+bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_thread:
@@ -2124,8 +2261,10 @@ bad_fork_cleanup_io:
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_mm:
- if (p->mm)
+ if (p->mm) {
+ mm_clear_owner(p->mm, p);
mmput(p->mm);
+ }
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
@@ -2156,7 +2295,7 @@ bad_fork_cleanup_count:
bad_fork_free:
p->state = TASK_DEAD;
put_task_stack(p);
- free_task(p);
+ delayed_free_task(p);
fork_out:
spin_lock_irq(&current->sighand->siglock);
hlist_del_init(&delayed.node);
@@ -2177,8 +2316,11 @@ static inline void init_idle_pids(struct task_struct *idle)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
- cpu_to_node(cpu));
+ struct kernel_clone_args args = {
+ .flags = CLONE_VM,
+ };
+
+ task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
if (!IS_ERR(task)) {
init_idle_pids(task);
init_idle(task, cpu);
@@ -2187,19 +2329,20 @@ struct task_struct *fork_idle(int cpu)
return task;
}
+struct mm_struct *copy_init_mm(void)
+{
+ return dup_mm(NULL, &init_mm);
+}
+
/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long _do_fork(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr,
- unsigned long tls)
+long _do_fork(struct kernel_clone_args *args)
{
+ u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
@@ -2215,7 +2358,7 @@ long _do_fork(unsigned long clone_flags,
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
- else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ else if (args->exit_signal != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
@@ -2224,8 +2367,7 @@ long _do_fork(unsigned long clone_flags,
trace = 0;
}
- p = copy_process(clone_flags, stack_start, stack_size,
- child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+ p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();
if (IS_ERR(p))
@@ -2241,7 +2383,7 @@ long _do_fork(unsigned long clone_flags,
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
- put_user(nr, parent_tidptr);
+ put_user(nr, args->parent_tid);
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
@@ -2273,8 +2415,16 @@ long do_fork(unsigned long clone_flags,
int __user *parent_tidptr,
int __user *child_tidptr)
{
- return _do_fork(clone_flags, stack_start, stack_size,
- parent_tidptr, child_tidptr, 0);
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = stack_start,
+ .stack_size = stack_size,
+ };
+
+ return _do_fork(&args);
}
#endif
@@ -2283,15 +2433,25 @@ long do_fork(unsigned long clone_flags,
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
- return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
- (unsigned long)arg, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
+ .stack = (unsigned long)fn,
+ .stack_size = (unsigned long)arg,
+ };
+
+ return _do_fork(&args);
}
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
- return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .exit_signal = SIGCHLD,
+ };
+
+ return _do_fork(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -2302,8 +2462,12 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
- 0, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .flags = CLONE_VFORK | CLONE_VM,
+ .exit_signal = SIGCHLD,
+ };
+
+ return _do_fork(&args);
}
#endif
@@ -2331,7 +2495,112 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
unsigned long, tls)
#endif
{
- return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .pidfd = parent_tidptr,
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = newsp,
+ .tls = tls,
+ };
+
+ /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+ if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+ return -EINVAL;
+
+ return _do_fork(&args);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE3
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+ struct clone_args __user *uargs,
+ size_t size)
+{
+ struct clone_args args;
+
+ if (unlikely(size > PAGE_SIZE))
+ return -E2BIG;
+
+ if (unlikely(size < sizeof(struct clone_args)))
+ return -EINVAL;
+
+ if (unlikely(!access_ok(uargs, size)))
+ return -EFAULT;
+
+ if (size > sizeof(struct clone_args)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uargs + sizeof(struct clone_args);
+ end = (void __user *)uargs + size;
+
+ for (; addr < end; addr++) {
+ if (get_user(val, addr))
+ return -EFAULT;
+ if (val)
+ return -E2BIG;
+ }
+
+ size = sizeof(struct clone_args);
+ }
+
+ if (copy_from_user(&args, uargs, size))
+ return -EFAULT;
+
+ *kargs = (struct kernel_clone_args){
+ .flags = args.flags,
+ .pidfd = u64_to_user_ptr(args.pidfd),
+ .child_tid = u64_to_user_ptr(args.child_tid),
+ .parent_tid = u64_to_user_ptr(args.parent_tid),
+ .exit_signal = args.exit_signal,
+ .stack = args.stack,
+ .stack_size = args.stack_size,
+ .tls = args.tls,
+ };
+
+ return 0;
+}
+
+static bool clone3_args_valid(const struct kernel_clone_args *kargs)
+{
+ /*
+ * All lower bits of the flag word are taken.
+ * Verify that no other unknown flags are passed along.
+ */
+ if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+ return false;
+
+ /*
+ * - make the CLONE_DETACHED bit reuseable for clone3
+ * - make the CSIGNAL bits reuseable for clone3
+ */
+ if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+ return false;
+
+ if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
+ kargs->exit_signal)
+ return false;
+
+ return true;
+}
+
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+ int err;
+
+ struct kernel_clone_args kargs;
+
+ err = copy_clone_args_from_user(&kargs, uargs, size);
+ if (err)
+ return err;
+
+ if (!clone3_args_valid(&kargs))
+ return -EINVAL;
+
+ return _do_fork(&kargs);
}
#endif
@@ -2439,7 +2708,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
return -EINVAL;
}
if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
- if (atomic_read(&current->sighand->count) > 1)
+ if (refcount_read(&current->sighand->count) > 1)
return -EINVAL;
}
if (unshare_flags & CLONE_VM) {
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b162b74611e4..c0738424bb43 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/freezer.c - Function to freeze a process
*
diff --git a/kernel/futex.c b/kernel/futex.c
index a0514e01c3eb..6d50728ef2e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Fast Userspace Mutexes (which I call "Futexes!").
* (C) Rusty Russell, IBM 2002
@@ -29,20 +30,6 @@
*
* "The futexes are also cursed."
* "But they come in a choice of three flavours!"
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/compat.h>
#include <linux/slab.h>
@@ -68,6 +55,7 @@
#include <linux/freezer.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
+#include <linux/refcount.h>
#include <asm/futex.h>
@@ -212,7 +200,7 @@ struct futex_pi_state {
struct rt_mutex pi_mutex;
struct task_struct *owner;
- atomic_t refcount;
+ refcount_t refcount;
union futex_key key;
} __randomize_layout;
@@ -321,12 +309,8 @@ static int __init fail_futex_debugfs(void)
if (IS_ERR(dir))
return PTR_ERR(dir);
- if (!debugfs_create_bool("ignore-private", mode, dir,
- &fail_futex.ignore_private)) {
- debugfs_remove_recursive(dir);
- return -ENOMEM;
- }
-
+ debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private);
return 0;
}
@@ -487,6 +471,37 @@ enum futex_access {
};
/**
+ * futex_setup_timer - set up the sleeping hrtimer.
+ * @time: ptr to the given timeout value
+ * @timeout: the hrtimer_sleeper structure to be set up
+ * @flags: futex flags
+ * @range_ns: optional range in ns
+ *
+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
+ * value given
+ */
+static inline struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns)
+{
+ if (!time)
+ return NULL;
+
+ hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(timeout, current);
+
+ /*
+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
+ * effectively the same as calling hrtimer_set_expires().
+ */
+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
+
+ return timeout;
+}
+
+/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
@@ -546,7 +561,7 @@ again:
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
- err = get_user_pages_fast(address, 1, 1, &page);
+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
* and get read-only access.
@@ -803,7 +818,7 @@ static int refill_pi_state_cache(void)
INIT_LIST_HEAD(&pi_state->list);
/* pi_mutex gets initialized later */
pi_state->owner = NULL;
- atomic_set(&pi_state->refcount, 1);
+ refcount_set(&pi_state->refcount, 1);
pi_state->key = FUTEX_KEY_INIT;
current->pi_state_cache = pi_state;
@@ -823,7 +838,7 @@ static struct futex_pi_state *alloc_pi_state(void)
static void get_pi_state(struct futex_pi_state *pi_state)
{
- WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}
/*
@@ -835,7 +850,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
if (!pi_state)
return;
- if (!atomic_dec_and_test(&pi_state->refcount))
+ if (!refcount_dec_and_test(&pi_state->refcount))
return;
/*
@@ -865,7 +880,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* refcount is at 0 - put it back to 1.
*/
pi_state->owner = NULL;
- atomic_set(&pi_state->refcount, 1);
+ refcount_set(&pi_state->refcount, 1);
current->pi_state_cache = pi_state;
}
}
@@ -908,7 +923,7 @@ void exit_pi_state_list(struct task_struct *curr)
* In that case; drop the locks to let put_pi_state() make
* progress and retry the loop.
*/
- if (!atomic_inc_not_zero(&pi_state->refcount)) {
+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
raw_spin_unlock_irq(&curr->pi_lock);
cpu_relax();
raw_spin_lock_irq(&curr->pi_lock);
@@ -1064,7 +1079,7 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
* and futex_wait_requeue_pi() as it cannot go to 0 and consequently
* free pi_state before we can take a reference ourselves.
*/
- WARN_ON(!atomic_read(&pi_state->refcount));
+ WARN_ON(!refcount_read(&pi_state->refcount));
/*
* Now that we have a pi_state, we can acquire wait_lock
@@ -1314,13 +1329,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
+ int err;
u32 uninitialized_var(curval);
if (unlikely(should_fail_futex(true)))
return -EFAULT;
- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
- return -EFAULT;
+ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+ if (unlikely(err))
+ return err;
/* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
@@ -1467,8 +1484,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
* Queue the task for later wakeup for after we've released
* the hb->lock. wake_q_add() grabs reference to p.
*/
- wake_q_add(wake_q, p);
- put_task_struct(p);
+ wake_q_add_safe(wake_q, p);
}
/*
@@ -1506,10 +1522,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
if (unlikely(should_fail_futex(true)))
ret = -EFAULT;
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
- ret = -EFAULT;
-
- } else if (curval != uval) {
+ ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+ if (!ret && (curval != uval)) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
* try the TID->0 transition) raced with a waiter setting the
@@ -1704,32 +1718,32 @@ retry_private:
double_lock_hb(hb1, hb2);
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
-
double_unlock_hb(hb1, hb2);
-#ifndef CONFIG_MMU
- /*
- * we don't get EFAULT from MMU faults if we don't have an MMU,
- * but we might get them from range checking
- */
- ret = op_ret;
- goto out_put_keys;
-#endif
-
- if (unlikely(op_ret != -EFAULT)) {
+ if (!IS_ENABLED(CONFIG_MMU) ||
+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+ /*
+ * we don't get EFAULT from MMU faults if we don't have
+ * an MMU, but we might get them from range checking
+ */
ret = op_ret;
goto out_put_keys;
}
- ret = fault_in_user_writeable(uaddr2);
- if (ret)
- goto out_put_keys;
+ if (op_ret == -EFAULT) {
+ ret = fault_in_user_writeable(uaddr2);
+ if (ret)
+ goto out_put_keys;
+ }
- if (!(flags & FLAGS_SHARED))
+ if (!(flags & FLAGS_SHARED)) {
+ cond_resched();
goto retry_private;
+ }
put_futex_key(&key2);
put_futex_key(&key1);
+ cond_resched();
goto retry;
}
@@ -2354,7 +2368,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
u32 uval, uninitialized_var(curval), newval;
struct task_struct *oldowner, *newowner;
u32 newtid;
- int ret;
+ int ret, err = 0;
lockdep_assert_held(q->lock_ptr);
@@ -2425,14 +2439,17 @@ retry:
if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED;
- if (get_futex_value_locked(&uval, uaddr))
- goto handle_fault;
+ err = get_futex_value_locked(&uval, uaddr);
+ if (err)
+ goto handle_err;
for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
- goto handle_fault;
+ err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+ if (err)
+ goto handle_err;
+
if (curval == uval)
break;
uval = curval;
@@ -2460,23 +2477,37 @@ retry:
return 0;
/*
- * To handle the page fault we need to drop the locks here. That gives
- * the other task (either the highest priority waiter itself or the
- * task which stole the rtmutex) the chance to try the fixup of the
- * pi_state. So once we are back from handling the fault we need to
- * check the pi_state after reacquiring the locks and before trying to
- * do another fixup. When the fixup has been done already we simply
- * return.
+ * In order to reschedule or handle a page fault, we need to drop the
+ * locks here. In the case of a fault, this gives the other task
+ * (either the highest priority waiter itself or the task which stole
+ * the rtmutex) the chance to try the fixup of the pi_state. So once we
+ * are back from handling the fault we need to check the pi_state after
+ * reacquiring the locks and before trying to do another fixup. When
+ * the fixup has been done already we simply return.
*
* Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
* drop hb->lock since the caller owns the hb -> futex_q relation.
* Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
-handle_fault:
+handle_err:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
- ret = fault_in_user_writeable(uaddr);
+ switch (err) {
+ case -EFAULT:
+ ret = fault_in_user_writeable(uaddr);
+ break;
+
+ case -EAGAIN:
+ cond_resched();
+ ret = 0;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ ret = err;
+ break;
+ }
spin_lock(q->lock_ptr);
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
@@ -2679,7 +2710,7 @@ out:
static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset)
{
- struct hrtimer_sleeper timeout, *to = NULL;
+ struct hrtimer_sleeper timeout, *to;
struct restart_block *restart;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
@@ -2689,17 +2720,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
return -EINVAL;
q.bitset = bitset;
- if (abs_time) {
- to = &timeout;
-
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires_range_ns(&to->timer, *abs_time,
- current->timer_slack_ns);
- }
-
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
retry:
/*
* Prepare to wait on uaddr. On success, holds hb lock and increments
@@ -2779,7 +2801,7 @@ static long futex_wait_restart(struct restart_block *restart)
static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
- struct hrtimer_sleeper timeout, *to = NULL;
+ struct hrtimer_sleeper timeout, *to;
struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
@@ -2792,13 +2814,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
if (refill_pi_state_cache())
return -ENOMEM;
- if (time) {
- to = &timeout;
- hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires(&to->timer, *time);
- }
+ to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
retry:
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
@@ -3045,10 +3061,8 @@ retry:
* A unconditional UNLOCK_PI op raced against a waiter
* setting the FUTEX_WAITERS bit. Try again.
*/
- if (ret == -EAGAIN) {
- put_futex_key(&key);
- goto retry;
- }
+ if (ret == -EAGAIN)
+ goto pi_retry;
/*
* wake_futex_pi has detected invalid state. Tell user
* space.
@@ -3063,9 +3077,19 @@ retry:
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
spin_unlock(&hb->lock);
- goto pi_faulted;
+ switch (ret) {
+ case -EFAULT:
+ goto pi_faulted;
+
+ case -EAGAIN:
+ goto pi_retry;
+
+ default:
+ WARN_ON_ONCE(1);
+ goto out_putkey;
+ }
}
/*
@@ -3079,6 +3103,11 @@ out_putkey:
put_futex_key(&key);
return ret;
+pi_retry:
+ put_futex_key(&key);
+ cond_resched();
+ goto retry;
+
pi_faulted:
put_futex_key(&key);
@@ -3182,7 +3211,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 val, ktime_t *abs_time, u32 bitset,
u32 __user *uaddr2)
{
- struct hrtimer_sleeper timeout, *to = NULL;
+ struct hrtimer_sleeper timeout, *to;
struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
@@ -3199,15 +3228,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (!bitset)
return -EINVAL;
- if (abs_time) {
- to = &timeout;
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires_range_ns(&to->timer, *abs_time,
- current->timer_slack_ns);
- }
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
/*
* The waiter is allocated on our stack, manipulated by the requeue
@@ -3439,47 +3461,67 @@ err_unlock:
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
u32 uval, uninitialized_var(nval), mval;
+ int err;
+
+ /* Futex address must be 32bit aligned */
+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+ return -1;
retry:
if (get_user(uval, uaddr))
return -1;
- if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
- /*
- * Ok, this dying thread is truly holding a futex
- * of interest. Set the OWNER_DIED bit atomically
- * via cmpxchg, and if the value had FUTEX_WAITERS
- * set, wake up a waiter (if any). (We have to do a
- * futex_wake() even if OWNER_DIED is already set -
- * to handle the rare but possible case of recursive
- * thread-death.) The rest of the cleanup is done in
- * userspace.
- */
- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
- /*
- * We are not holding a lock here, but we want to have
- * the pagefault_disable/enable() protection because
- * we want to handle the fault gracefully. If the
- * access fails we try to fault in the futex with R/W
- * verification via get_user_pages. get_user() above
- * does not guarantee R/W access. If that fails we
- * give up and leave the futex locked.
- */
- if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ return 0;
+
+ /*
+ * Ok, this dying thread is truly holding a futex
+ * of interest. Set the OWNER_DIED bit atomically
+ * via cmpxchg, and if the value had FUTEX_WAITERS
+ * set, wake up a waiter (if any). (We have to do a
+ * futex_wake() even if OWNER_DIED is already set -
+ * to handle the rare but possible case of recursive
+ * thread-death.) The rest of the cleanup is done in
+ * userspace.
+ */
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
+ switch (err) {
+ case -EFAULT:
if (fault_in_user_writeable(uaddr))
return -1;
goto retry;
- }
- if (nval != uval)
+
+ case -EAGAIN:
+ cond_resched();
goto retry;
- /*
- * Wake robust non-PI futexes here. The wakeup of
- * PI futexes happens in exit_pi_state():
- */
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ default:
+ WARN_ON_ONCE(1);
+ return err;
+ }
}
+
+ if (nval != uval)
+ goto retry;
+
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi && (uval & FUTEX_WAITERS))
+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+
return 0;
}
@@ -3823,7 +3865,7 @@ err_unlock:
#endif /* CONFIG_COMPAT */
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 1e3823fa799b..3941a9c48f83 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "GCOV-based kernel profiling"
config GCOV_KERNEL
@@ -53,6 +54,7 @@ config GCOV_PROFILE_ALL
choice
prompt "Specify GCOV format"
depends on GCOV_KERNEL
+ depends on CC_IS_GCC
---help---
The gcov format is usually determined by the GCC version, and the
default is chosen according to your GCC version. However, there are
@@ -62,7 +64,7 @@ choice
config GCOV_FORMAT_3_4
bool "GCC 3.4 format"
- depends on CC_IS_GCC && GCC_VERSION < 40700
+ depends on GCC_VERSION < 40700
---help---
Select this option to use the format defined by GCC 3.4.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index ff06d64df397..d66a74b0f100 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -2,5 +2,6 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
-obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
-obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
+obj-$(CONFIG_CC_IS_CLANG) += clang.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9c7c8d5c18f2..0ffe9f194080 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -22,88 +22,8 @@
#include <linux/sched.h>
#include "gcov.h"
-static int gcov_events_enabled;
-static DEFINE_MUTEX(gcov_lock);
-
-/*
- * __gcov_init is called by gcc-generated constructor code for each object
- * file compiled with -fprofile-arcs.
- */
-void __gcov_init(struct gcov_info *info)
-{
- static unsigned int gcov_version;
-
- mutex_lock(&gcov_lock);
- if (gcov_version == 0) {
- gcov_version = gcov_info_version(info);
- /*
- * Printing gcc's version magic may prove useful for debugging
- * incompatibility reports.
- */
- pr_info("version magic: 0x%x\n", gcov_version);
- }
- /*
- * Add new profiling data structure to list and inform event
- * listener.
- */
- gcov_info_link(info);
- if (gcov_events_enabled)
- gcov_event(GCOV_ADD, info);
- mutex_unlock(&gcov_lock);
-}
-EXPORT_SYMBOL(__gcov_init);
-
-/*
- * These functions may be referenced by gcc-generated profiling code but serve
- * no function for kernel profiling.
- */
-void __gcov_flush(void)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_flush);
-
-void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_add);
-
-void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_single);
-
-void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_delta);
-
-void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_ior);
-
-void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_time_profile);
-
-void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_icall_topn);
-
-void __gcov_exit(void)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_exit);
+int gcov_events_enabled;
+DEFINE_MUTEX(gcov_lock);
/**
* gcov_enable_events - enable event reporting through gcov_event()
@@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
- if (within_module((unsigned long)info, mod)) {
+ if (gcov_info_within_module(info, mod)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
new file mode 100644
index 000000000000..c94b820a1b62
--- /dev/null
+++ b/kernel/gcov/clang.c
@@ -0,0 +1,581 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Google, Inc.
+ * modified from kernel/gcov/gcc_4_7.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * LLVM uses profiling data that's deliberately similar to GCC, but has a
+ * very different way of exporting that data. LLVM calls llvm_gcov_init() once
+ * per module, and provides a couple of callbacks that we can use to ask for
+ * more data.
+ *
+ * We care about the "writeout" callback, which in turn calls back into
+ * compiler-rt/this module to dump all the gathered coverage data to disk:
+ *
+ * llvm_gcda_start_file()
+ * llvm_gcda_emit_function()
+ * llvm_gcda_emit_arcs()
+ * llvm_gcda_emit_function()
+ * llvm_gcda_emit_arcs()
+ * [... repeats for each function ...]
+ * llvm_gcda_summary_info()
+ * llvm_gcda_end_file()
+ *
+ * This design is much more stateless and unstructured than gcc's, and is
+ * intended to run at process exit. This forces us to keep some local state
+ * about which module we're dealing with at the moment. On the other hand, it
+ * also means we don't depend as much on how LLVM represents profiling data
+ * internally.
+ *
+ * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more
+ * details on how this works, particularly GCOVProfiler::emitProfileArcs(),
+ * GCOVProfiler::insertCounterWriteout(), and
+ * GCOVProfiler::insertFlush().
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+typedef void (*llvm_gcov_callback)(void);
+
+struct gcov_info {
+ struct list_head head;
+
+ const char *filename;
+ unsigned int version;
+ u32 checksum;
+
+ struct list_head functions;
+};
+
+struct gcov_fn_info {
+ struct list_head head;
+
+ u32 ident;
+ u32 checksum;
+ u8 use_extra_checksum;
+ u32 cfg_checksum;
+
+ u32 num_counters;
+ u64 *counters;
+ const char *function_name;
+};
+
+static struct gcov_info *current_info;
+
+static LIST_HEAD(clang_gcov_list);
+
+void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
+{
+ struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ INIT_LIST_HEAD(&info->functions);
+
+ mutex_lock(&gcov_lock);
+
+ list_add_tail(&info->head, &clang_gcov_list);
+ current_info = info;
+ writeout();
+ current_info = NULL;
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(llvm_gcov_init);
+
+void llvm_gcda_start_file(const char *orig_filename, const char version[4],
+ u32 checksum)
+{
+ current_info->filename = orig_filename;
+ memcpy(&current_info->version, version, sizeof(current_info->version));
+ current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+
+void llvm_gcda_emit_function(u32 ident, const char *function_name,
+ u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
+{
+ struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ info->ident = ident;
+ info->checksum = func_checksum;
+ info->use_extra_checksum = use_extra_checksum;
+ info->cfg_checksum = cfg_checksum;
+ if (function_name)
+ info->function_name = kstrdup(function_name, GFP_KERNEL);
+
+ list_add_tail(&info->head, &current_info->functions);
+}
+EXPORT_SYMBOL(llvm_gcda_emit_function);
+
+void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
+{
+ struct gcov_fn_info *info = list_last_entry(&current_info->functions,
+ struct gcov_fn_info, head);
+
+ info->num_counters = num_counters;
+ info->counters = counters;
+}
+EXPORT_SYMBOL(llvm_gcda_emit_arcs);
+
+void llvm_gcda_summary_info(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_summary_info);
+
+void llvm_gcda_end_file(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_end_file);
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return list_first_entry_or_null(&clang_gcov_list,
+ struct gcov_info, head);
+ if (list_is_last(&info->head, &clang_gcov_list))
+ return NULL;
+ return list_next_entry(info, head);
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ list_add_tail(&info->head, &clang_gcov_list);
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ /* Generic code unlinks while iterating. */
+ __list_del_entry(&info->head);
+}
+
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info->filename, mod);
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn;
+
+ list_for_each_entry(fn, &info->functions, head)
+ memset(fn->counters, 0,
+ sizeof(fn->counters[0]) * fn->num_counters);
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null(
+ &info1->functions, struct gcov_fn_info, head);
+ struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null(
+ &info2->functions, struct gcov_fn_info, head);
+
+ if (info1->checksum != info2->checksum)
+ return false;
+ if (!fn_ptr1)
+ return fn_ptr1 == fn_ptr2;
+ while (!list_is_last(&fn_ptr1->head, &info1->functions) &&
+ !list_is_last(&fn_ptr2->head, &info2->functions)) {
+ if (fn_ptr1->checksum != fn_ptr2->checksum)
+ return false;
+ if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
+ return false;
+ if (fn_ptr1->use_extra_checksum &&
+ fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+ return false;
+ fn_ptr1 = list_next_entry(fn_ptr1, head);
+ fn_ptr2 = list_next_entry(fn_ptr2, head);
+ }
+ return list_is_last(&fn_ptr1->head, &info1->functions) &&
+ list_is_last(&fn_ptr2->head, &info2->functions);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+ struct gcov_fn_info *dfn_ptr;
+ struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions,
+ struct gcov_fn_info, head);
+
+ list_for_each_entry(dfn_ptr, &dst->functions, head) {
+ u32 i;
+
+ for (i = 0; i < sfn_ptr->num_counters; i++)
+ dfn_ptr->counters[i] += sfn_ptr->counters[i];
+ }
+}
+
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+ size_t cv_size; /* counter values size */
+ struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+ GFP_KERNEL);
+ if (!fn_dup)
+ return NULL;
+ INIT_LIST_HEAD(&fn_dup->head);
+
+ fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
+ if (!fn_dup->function_name)
+ goto err_name;
+
+ cv_size = fn->num_counters * sizeof(fn->counters[0]);
+ fn_dup->counters = vmalloc(cv_size);
+ if (!fn_dup->counters)
+ goto err_counters;
+ memcpy(fn_dup->counters, fn->counters, cv_size);
+
+ return fn_dup;
+
+err_counters:
+ kfree(fn_dup->function_name);
+err_name:
+ kfree(fn_dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ struct gcov_fn_info *fn;
+
+ dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+ if (!dup)
+ return NULL;
+ INIT_LIST_HEAD(&dup->head);
+ INIT_LIST_HEAD(&dup->functions);
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err;
+
+ list_for_each_entry(fn, &info->functions, head) {
+ struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn);
+
+ if (!fn_dup)
+ goto err;
+ list_add_tail(&fn_dup->head, &dup->functions);
+ }
+
+ return dup;
+
+err:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn, *tmp;
+
+ list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+ kfree(fn->function_name);
+ vfree(fn->counters);
+ list_del(&fn->head);
+ kfree(fn);
+ }
+ kfree(info->filename);
+ kfree(info);
+}
+
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ void *buffer;
+ size_t size;
+ loff_t pos;
+};
+
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+ struct gcov_fn_info *fi_ptr;
+ size_t pos = 0;
+
+ /* File header. */
+ pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+ pos += store_gcov_u32(buffer, pos, info->version);
+ pos += store_gcov_u32(buffer, pos, info->checksum);
+
+ list_for_each_entry(fi_ptr, &info->functions, head) {
+ u32 i;
+ u32 len = 2;
+
+ if (fi_ptr->use_extra_checksum)
+ len++;
+
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+ pos += store_gcov_u32(buffer, pos, len);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
+ if (fi_ptr->use_extra_checksum)
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
+ for (i = 0; i < fi_ptr->num_counters; i++)
+ pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]);
+ }
+
+ return pos;
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
+ if (!iter)
+ goto err_free;
+
+ iter->info = info;
+ /* Dry-run to get the actual buffer size. */
+ iter->size = convert_to_gcda(NULL, info);
+ iter->buffer = vmalloc(iter->size);
+ if (!iter->buffer)
+ goto err_free;
+
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+
+err_free:
+ kfree(iter);
+ return NULL;
+}
+
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ vfree(iter->buffer);
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 6e40ff6be083..e5eb5ea7ea59 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -64,7 +64,6 @@ struct gcov_node {
static const char objtree[] = OBJTREE;
static const char srctree[] = SRCTREE;
static struct gcov_node root_node;
-static struct dentry *reset_dentry;
static LIST_HEAD(all_head);
static DEFINE_MUTEX(node_lock);
@@ -387,8 +386,6 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
goto out_err;
node->links[i] = debugfs_create_symlink(deskew(basename),
parent, target);
- if (!node->links[i])
- goto out_err;
kfree(target);
}
@@ -450,11 +447,6 @@ static struct gcov_node *new_node(struct gcov_node *parent,
parent->dentry, node, &gcov_data_fops);
} else
node->dentry = debugfs_create_dir(node->name, parent->dentry);
- if (!node->dentry) {
- pr_warn("could not create file\n");
- kfree(node);
- return NULL;
- }
if (info)
add_links(node, parent->dentry);
list_add(&node->list, &parent->children);
@@ -761,32 +753,20 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
/* Create debugfs entries. */
static __init int gcov_fs_init(void)
{
- int rc = -EIO;
-
init_node(&root_node, NULL, NULL, NULL);
/*
* /sys/kernel/debug/gcov will be parent for the reset control file
* and all profiling files.
*/
root_node.dentry = debugfs_create_dir("gcov", NULL);
- if (!root_node.dentry)
- goto err_remove;
/*
* Create reset file which resets all profiling counts when written
* to.
*/
- reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
- NULL, &gcov_reset_fops);
- if (!reset_dentry)
- goto err_remove;
+ debugfs_create_file("reset", 0600, root_node.dentry, NULL,
+ &gcov_reset_fops);
/* Replay previous events to get our fs hierarchy up-to-date. */
gcov_enable_events();
return 0;
-
-err_remove:
- pr_err("init failed\n");
- debugfs_remove(root_node.dentry);
-
- return rc;
}
device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 1e32e66c9563..801ee4b0b969 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
gcov_info_head = info->next;
}
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info, mod);
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
@@ -245,8 +257,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info)
/* Duplicate gcov_info. */
active = num_counter_active(info);
- dup = kzalloc(sizeof(struct gcov_info) +
- sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+ dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL);
if (!dup)
return NULL;
dup->version = info->version;
@@ -364,8 +375,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
{
struct gcov_iterator *iter;
- iter = kzalloc(sizeof(struct gcov_iterator) +
- num_counter_active(info) * sizeof(struct type_info),
+ iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)),
GFP_KERNEL);
if (iter)
iter->info = info;
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index ca5e5c0ef853..ec37563674d6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
gcov_info_head = info->next;
}
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info, mod);
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c
new file mode 100644
index 000000000000..3cf736b9f880
--- /dev/null
+++ b/kernel/gcov/gcc_base.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include "gcov.h"
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+ static unsigned int gcov_version;
+
+ mutex_lock(&gcov_lock);
+ if (gcov_version == 0) {
+ gcov_version = gcov_info_version(info);
+ /*
+ * Printing gcc's version magic may prove useful for debugging
+ * incompatibility reports.
+ */
+ pr_info("version magic: 0x%x\n", gcov_version);
+ }
+ /*
+ * Add new profiling data structure to list and inform event
+ * listener.
+ */
+ gcov_info_link(info);
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
+void __gcov_exit(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index de118ad4a024..6ab2c1808c9d 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -15,6 +15,7 @@
#ifndef GCOV_H
#define GCOV_H GCOV_H
+#include <linux/module.h>
#include <linux/types.h>
/*
@@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info);
struct gcov_info *gcov_info_next(struct gcov_info *info);
void gcov_info_link(struct gcov_info *info);
void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
/* Base interface. */
enum gcov_action {
@@ -83,4 +85,7 @@ struct gcov_link {
};
extern const struct gcov_link gcov_link[];
+extern int gcov_events_enabled;
+extern struct mutex gcov_lock;
+
#endif /* GCOV_H */
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
new file mode 100755
index 000000000000..9ff449888d9c
--- /dev/null
+++ b/kernel/gen_kheaders.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This script generates an archive consisting of kernel headers
+# for CONFIG_IKHEADERS.
+set -e
+sfile="$(readlink -f "$0")"
+outdir="$(pwd)"
+tarfile=$1
+cpio_dir=$outdir/$tarfile.tmp
+
+dir_list="
+include/
+arch/$SRCARCH/include/
+"
+
+# Support incremental builds by skipping archive generation
+# if timestamps of files being archived are not changed.
+
+# This block is useful for debugging the incremental builds.
+# Uncomment it for debugging.
+# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter;
+# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi
+# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter
+# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter
+
+# include/generated/compile.h is ignored because it is touched even when none
+# of the source files changed. This causes pointless regeneration, so let us
+# ignore them for md5 calculation.
+pushd $srctree > /dev/null
+src_files_md5="$(find $dir_list -name "*.h" |
+ grep -v "include/generated/compile.h" |
+ grep -v "include/generated/autoconf.h" |
+ xargs ls -l | md5sum | cut -d ' ' -f1)"
+popd > /dev/null
+obj_files_md5="$(find $dir_list -name "*.h" |
+ grep -v "include/generated/compile.h" |
+ grep -v "include/generated/autoconf.h" |
+ xargs ls -l | md5sum | cut -d ' ' -f1)"
+# Any changes to this script will also cause a rebuild of the archive.
+this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)"
+if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
+if [ -f kernel/kheaders.md5 ] &&
+ [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
+ exit
+fi
+
+if [ "${quiet}" != "silent_" ]; then
+ echo " GEN $tarfile"
+fi
+
+rm -rf $cpio_dir
+mkdir $cpio_dir
+
+pushd $srctree > /dev/null
+for f in $dir_list;
+ do find "$f" -name "*.h";
+done | cpio --quiet -pd $cpio_dir
+popd > /dev/null
+
+# The second CPIO can complain if files already exist which can
+# happen with out of tree builds. Just silence CPIO for now.
+for f in $dir_list;
+ do find "$f" -name "*.h";
+done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
+
+# Remove comments except SDPX lines
+find $cpio_dir -type f -print0 |
+ xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
+
+tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
+
+echo "$src_files_md5" > kernel/kheaders.md5
+echo "$obj_files_md5" >> kernel/kheaders.md5
+echo "$this_file_md5" >> kernel/kheaders.md5
+echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
+
+rm -rf $cpio_dir
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 4a9191617076..14a625c16cb3 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Detect Hung Task
*
@@ -19,6 +20,7 @@
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
+#include <linux/sched/sysctl.h>
#include <trace/events/sched.h>
@@ -126,7 +128,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
if (sysctl_hung_task_warnings > 0)
sysctl_hung_task_warnings--;
pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
- t->comm, t->pid, timeout);
+ t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
pr_err(" %s %s %.*s\n",
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
diff --git a/kernel/iomem.c b/kernel/iomem.c
index f7525e14ebc6..62c92e43aa0d 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size,
*
* MEMREMAP_WB - matches the default mapping for System RAM on
* the architecture. This is usually a read-allocate write-back cache.
- * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
* memremap() will bypass establishing a new mapping and instead return
* a pointer into the direct map.
*
@@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
/* Try all mapping types requested until one returns non-NULL */
if (flags & MEMREMAP_WB) {
/*
- * MEMREMAP_WB is special in that it can be satisifed
+ * MEMREMAP_WB is special in that it can be satisfied
* from the direct map. Some archs depend on the
* capability of memremap() to autodetect cases where
* the requested range is potentially in System RAM.
@@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap);
void memunmap(void *addr)
{
- if (is_vmalloc_addr(addr))
+ if (is_ioremap_addr(addr))
iounmap((void __iomem *) addr);
}
EXPORT_SYMBOL(memunmap);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5f3e2baefca9..f92d9a687372 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "IRQ subsystem"
# Options selectable by the architecture code
@@ -91,6 +92,9 @@ config GENERIC_MSI_IRQ_DOMAIN
select IRQ_DOMAIN_HIERARCHY
select GENERIC_MSI_IRQ
+config IRQ_MSI_IOMMU
+ bool
+
config HANDLE_DOMAIN_IRQ
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index ff6e352e3a6c..b4f53717d143 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,6 +2,9 @@
obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
obj-$(CONFIG_IRQ_TIMINGS) += timings.o
+ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y)
+ CFLAGS_timings.o += -DDEBUG
+endif
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 45b68b4ea48b..4352b08ae48d 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -9,7 +9,7 @@
#include <linux/cpu.h>
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- int cpus_per_vec)
+ unsigned int cpus_per_vec)
{
const struct cpumask *siblmsk;
int cpu, sibl;
@@ -94,16 +94,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
return nodes;
}
-static int __irq_build_affinity_masks(const struct irq_affinity *affd,
- int startvec, int numvecs, int firstvec,
+static int __irq_build_affinity_masks(unsigned int startvec,
+ unsigned int numvecs,
+ unsigned int firstvec,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
struct irq_affinity_desc *masks)
{
- int n, nodes, cpus_per_vec, extra_vecs, done = 0;
- int last_affv = firstvec + numvecs;
- int curvec = startvec;
+ unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ unsigned int last_affv = firstvec + numvecs;
+ unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
if (!cpumask_weight(cpu_mask))
@@ -117,18 +118,16 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
*/
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
- cpumask_or(&masks[curvec].mask,
- &masks[curvec].mask,
- node_to_cpumask[n]);
+ cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
+ node_to_cpumask[n]);
if (++curvec == last_affv)
curvec = firstvec;
}
- done = numvecs;
- goto out;
+ return numvecs;
}
for_each_node_mask(n, nodemsk) {
- int ncpus, v, vecs_to_assign, vecs_per_node;
+ unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
/* Spread the vectors per node */
vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
@@ -163,8 +162,6 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
curvec = firstvec;
--nodes;
}
-
-out:
return done;
}
@@ -173,20 +170,24 @@ out:
* 1) spread present CPU on these vectors
* 2) spread other possible CPUs on these vectors
*/
-static int irq_build_affinity_masks(const struct irq_affinity *affd,
- int startvec, int numvecs, int firstvec,
- cpumask_var_t *node_to_cpumask,
+static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
+ unsigned int firstvec,
struct irq_affinity_desc *masks)
{
- int curvec = startvec, nr_present, nr_others;
- int ret = -ENOMEM;
+ unsigned int curvec = startvec, nr_present, nr_others;
+ cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
+ int ret = -ENOMEM;
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return ret;
if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
- goto fail;
+ goto fail_nmsk;
+
+ node_to_cpumask = alloc_node_to_cpumask();
+ if (!node_to_cpumask)
+ goto fail_npresmsk;
ret = 0;
/* Stabilize the cpumasks */
@@ -194,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
build_node_to_cpumask(node_to_cpumask);
/* Spread on present CPUs starting from affd->pre_vectors */
- nr_present = __irq_build_affinity_masks(affd, curvec, numvecs,
+ nr_present = __irq_build_affinity_masks(curvec, numvecs,
firstvec, node_to_cpumask,
cpu_present_mask, nmsk, masks);
@@ -209,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
else
curvec = firstvec + nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- nr_others = __irq_build_affinity_masks(affd, curvec, numvecs,
+ nr_others = __irq_build_affinity_masks(curvec, numvecs,
firstvec, node_to_cpumask,
npresmsk, nmsk, masks);
put_online_cpus();
@@ -217,13 +218,22 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
if (nr_present < numvecs)
WARN_ON(nr_present + nr_others < numvecs);
+ free_node_to_cpumask(node_to_cpumask);
+
+ fail_npresmsk:
free_cpumask_var(npresmsk);
- fail:
+ fail_nmsk:
free_cpumask_var(nmsk);
return ret;
}
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+ affd->nr_sets = 1;
+ affd->set_size[0] = affvecs;
+}
+
/**
* irq_create_affinity_masks - Create affinity masks for multiqueue spreading
* @nvecs: The total number of vectors
@@ -232,50 +242,62 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
* Returns the irq_affinity_desc pointer or NULL if allocation failed.
*/
struct irq_affinity_desc *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
{
- int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
- int curvec, usedvecs;
- cpumask_var_t *node_to_cpumask;
+ unsigned int affvecs, curvec, usedvecs, i;
struct irq_affinity_desc *masks = NULL;
- int i, nr_sets;
/*
- * If there aren't any vectors left after applying the pre/post
- * vectors don't bother with assigning affinity.
+ * Determine the number of vectors which need interrupt affinities
+ * assigned. If the pre/post request exhausts the available vectors
+ * then nothing to do here except for invoking the calc_sets()
+ * callback so the device driver can adjust to the situation. If there
+ * is only a single vector, then managing the queue is pointless as
+ * well.
*/
- if (nvecs == affd->pre_vectors + affd->post_vectors)
+ if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors)
+ affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+ else
+ affvecs = 0;
+
+ /*
+ * Simple invocations do not provide a calc_sets() callback. Install
+ * the generic one.
+ */
+ if (!affd->calc_sets)
+ affd->calc_sets = default_calc_sets;
+
+ /* Recalculate the sets */
+ affd->calc_sets(affd, affvecs);
+
+ if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
return NULL;
- node_to_cpumask = alloc_node_to_cpumask();
- if (!node_to_cpumask)
+ /* Nothing to assign? */
+ if (!affvecs)
return NULL;
masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
if (!masks)
- goto outnodemsk;
+ return NULL;
/* Fill out vectors at the beginning that don't need affinity */
for (curvec = 0; curvec < affd->pre_vectors; curvec++)
cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
/*
* Spread on present CPUs starting from affd->pre_vectors. If we
* have multiple sets, build each sets affinity mask separately.
*/
- nr_sets = affd->nr_sets;
- if (!nr_sets)
- nr_sets = 1;
-
- for (i = 0, usedvecs = 0; i < nr_sets; i++) {
- int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+ unsigned int this_vecs = affd->set_size[i];
int ret;
- ret = irq_build_affinity_masks(affd, curvec, this_vecs,
- curvec, node_to_cpumask, masks);
+ ret = irq_build_affinity_masks(curvec, this_vecs,
+ curvec, masks);
if (ret) {
kfree(masks);
- masks = NULL;
- goto outnodemsk;
+ return NULL;
}
curvec += this_vecs;
usedvecs += this_vecs;
@@ -293,8 +315,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
masks[i].is_managed = 1;
-outnodemsk:
- free_node_to_cpumask(node_to_cpumask);
return masks;
}
@@ -304,25 +324,22 @@ outnodemsk:
* @maxvec: The maximum number of vectors available
* @affd: Description of the affinity requirements
*/
-int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+ const struct irq_affinity *affd)
{
- int resv = affd->pre_vectors + affd->post_vectors;
- int vecs = maxvec - resv;
- int set_vecs;
+ unsigned int resv = affd->pre_vectors + affd->post_vectors;
+ unsigned int set_vecs;
if (resv > minvec)
return 0;
- if (affd->nr_sets) {
- int i;
-
- for (i = 0, set_vecs = 0; i < affd->nr_sets; i++)
- set_vecs += affd->sets[i];
+ if (affd->calc_sets) {
+ set_vecs = maxvec - resv;
} else {
get_online_cpus();
set_vecs = cpumask_weight(cpu_possible_mask);
put_online_cpus();
}
- return resv + min(set_vecs, vecs);
+ return resv + min(set_vecs, maxvec - resv);
}
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 16cbf6beb276..ae60cae24e9a 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -90,7 +90,7 @@ unsigned long probe_irq_on(void)
/* It triggered already - consider it spurious. */
if (!(desc->istate & IRQS_WAITING)) {
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
} else
if (i < 32)
mask |= 1 << i;
@@ -127,7 +127,7 @@ unsigned int probe_irq_mask(unsigned long val)
mask |= 1 << i;
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -169,7 +169,7 @@ int probe_irq_off(unsigned long val)
nr_of_irqs++;
}
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 34e969069488..b76703b2c0af 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -314,6 +314,12 @@ void irq_shutdown(struct irq_desc *desc)
}
irq_state_clr_started(desc);
}
+}
+
+
+void irq_shutdown_and_deactivate(struct irq_desc *desc)
+{
+ irq_shutdown(desc);
/*
* This must be called even if the interrupt was never started up,
* because the activation can happen before the interrupt is
@@ -730,6 +736,39 @@ out:
EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
+ * handle_fasteoi_nmi - irq handler for NMI interrupt lines
+ * @desc: the interrupt description structure for this irq
+ *
+ * A simple NMI-safe handler, considering the restrictions
+ * from request_nmi.
+ *
+ * Only a single callback will be issued to the chip: an ->eoi()
+ * call when the interrupt has been serviced. This enables support
+ * for modern forms of interrupt handlers, which handle the flow
+ * details in hardware, transparently.
+ */
+void handle_fasteoi_nmi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ __kstat_incr_irqs_this_cpu(desc);
+
+ trace_irq_handler_entry(irq, action);
+ /*
+ * NMIs cannot be shared, there is only one action.
+ */
+ res = action->handler(irq, action->dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+EXPORT_SYMBOL_GPL(handle_fasteoi_nmi);
+
+/**
* handle_edge_irq - edge type IRQ handler
* @desc: the interrupt description structure for this irq
*
@@ -855,7 +894,11 @@ void handle_percpu_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -884,7 +927,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -908,6 +955,31 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
chip->irq_eoi(&desc->irq_data);
}
+/**
+ * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
+ * dev ids
+ * @desc: the interrupt description structure for this irq
+ *
+ * Similar to handle_fasteoi_nmi, but handling the dev_id cookie
+ * as a percpu pointer.
+ */
+void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ __kstat_incr_irqs_this_cpu(desc);
+
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+
static void
__irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
int is_chained, const char *name)
@@ -1278,6 +1350,17 @@ void irq_chip_mask_parent(struct irq_data *data)
EXPORT_SYMBOL_GPL(irq_chip_mask_parent);
/**
+ * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_mask_ack_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_mask_ack(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent);
+
+/**
* irq_chip_unmask_parent - Unmask the parent interrupt
* @data: Pointer to interrupt specific data
*/
@@ -1376,11 +1459,43 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
{
data = data->parent_data;
+
+ if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
+ return 0;
+
if (data->chip->irq_set_wake)
return data->chip->irq_set_wake(data, on);
return -ENOSYS;
}
+EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent);
+
+/**
+ * irq_chip_request_resources_parent - Request resources on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+int irq_chip_request_resources_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+
+ if (data->chip->irq_request_resources)
+ return data->chip->irq_request_resources(data);
+
+ return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent);
+
+/**
+ * irq_chip_release_resources_parent - Release resources on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_release_resources_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ if (data->chip->irq_release_resources)
+ data->chip->irq_release_resources(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
#endif
/**
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 5b1072e394b2..6c7ca2e983a5 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -116,7 +116,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
*/
if (irqd_affinity_is_managed(d)) {
irqd_set_managed_shutdown(d);
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
return false;
}
affinity = cpu_online_mask;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 6f636136cccc..c1eccd4f6520 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -56,6 +56,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),
BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
+ BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
};
static void
@@ -140,6 +141,7 @@ static const struct irq_bit_descr irqdesc_istates[] = {
BIT_MASK_DESCR(IRQS_WAITING),
BIT_MASK_DESCR(IRQS_PENDING),
BIT_MASK_DESCR(IRQS_SUSPENDED),
+ BIT_MASK_DESCR(IRQS_NMI),
};
@@ -150,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
raw_spin_lock_irq(&desc->lock);
data = irq_desc_get_irq_data(desc);
- seq_printf(m, "handler: %pf\n", desc->handle_irq);
+ seq_printf(m, "handler: %ps\n", desc->handle_irq);
seq_printf(m, "device: %s\n", desc->dev_name);
seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
@@ -203,8 +205,8 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf,
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
- if (irq_settings_is_level(desc)) {
- /* Can't do level, sorry */
+ if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) {
+ /* Can't do level nor NMIs, sorry */
err = -EINVAL;
} else {
desc->istate |= IRQS_PENDING;
@@ -256,8 +258,6 @@ static int __init irq_debugfs_init(void)
int irq;
root_dir = debugfs_create_dir("irq", NULL);
- if (!root_dir)
- return -ENOMEM;
irq_domain_debugfs_init(root_dir);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 5d5378ea0afe..f6e5515ee077 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -84,8 +84,6 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
* @dev: device to request interrupt for
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs
- * @thread_fn: function to be called in a threaded interrupt context. NULL
- * for devices which handle everything in @handler
* @irqflags: Interrupt type flags
* @devname: An ascii name for the claiming device, dev_name(dev) if NULL
* @dev_id: A cookie passed back to the handler function
@@ -222,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
irq_flow_handler_t handler)
{
struct irq_chip_generic *gc;
- unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
- gc = devm_kzalloc(dev, sz, GFP_KERNEL);
+ gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);
if (gc)
irq_init_generic_chip(gc, name, num_ct,
irq_base, reg_base, handler);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 38554bc35375..a4ace611f47f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
res = action->handler(irq, action->dev_id);
trace_irq_handler_exit(irq, action, res);
- if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
+ if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
irq, action->handler))
local_irq_disable();
@@ -166,7 +166,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
__irq_wake_thread(desc, action);
- /* Fall through to add to randomness */
+ /* Fall through - to add to randomness */
case IRQ_HANDLED:
*flags |= action->flags;
break;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ca6afa267070..3924fbe829d4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -49,6 +49,7 @@ enum {
* IRQS_WAITING - irq is waiting
* IRQS_PENDING - irq is pending and replayed later
* IRQS_SUSPENDED - irq is suspended
+ * IRQS_NMI - irq line is used to deliver NMIs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -60,6 +61,7 @@ enum {
IRQS_PENDING = 0x00000200,
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
+ IRQS_NMI = 0x00002000,
};
#include "debug.h"
@@ -80,6 +82,7 @@ extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
extern void irq_shutdown(struct irq_desc *desc);
+extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
@@ -94,6 +97,10 @@ static inline void irq_mark_irq(unsigned int irq) { }
extern void irq_mark_irq(unsigned int irq);
#endif
+extern int __irq_get_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which,
+ bool *state);
+
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
@@ -242,12 +249,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc)
#undef __irqd_to_state
-static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+{
+ __kstat_incr_irqs_this_cpu(desc);
+ desc->tot_count++;
+}
+
static inline int irq_desc_get_node(struct irq_desc *desc)
{
return irq_common_data_get_node(&desc->irq_common_data);
@@ -346,6 +359,16 @@ static inline int irq_timing_decode(u64 value, u64 *timestamp)
return value & U16_MAX;
}
+static __always_inline void irq_timings_push(u64 ts, int irq)
+{
+ struct irq_timings *timings = this_cpu_ptr(&irq_timings);
+
+ timings->values[timings->count & IRQ_TIMINGS_MASK] =
+ irq_timing_encode(ts, irq);
+
+ timings->count++;
+}
+
/*
* The function record_irq_time is only called in one place in the
* interrupts handler. We want this function always inline so the code
@@ -359,15 +382,8 @@ static __always_inline void record_irq_time(struct irq_desc *desc)
if (!static_branch_likely(&irq_timing_enabled))
return;
- if (desc->istate & IRQS_TIMINGS) {
- struct irq_timings *timings = this_cpu_ptr(&irq_timings);
-
- timings->values[timings->count & IRQ_TIMINGS_MASK] =
- irq_timing_encode(local_clock(),
- irq_desc_get_irq(desc));
-
- timings->count++;
- }
+ if (desc->istate & IRQS_TIMINGS)
+ irq_timings_push(local_clock(), irq_desc_get_irq(desc));
}
#else
static inline void irq_remove_timings(struct irq_desc *desc) {}
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 98a20e1594ce..b992f88c5613 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -25,10 +25,22 @@ static void irq_sim_irqunmask(struct irq_data *data)
irq_ctx->enabled = true;
}
+static int irq_sim_set_type(struct irq_data *data, unsigned int type)
+{
+ /* We only support rising and falling edge trigger types. */
+ if (type & ~IRQ_TYPE_EDGE_BOTH)
+ return -EINVAL;
+
+ irqd_set_trigger_type(data, type);
+
+ return 0;
+}
+
static struct irq_chip irq_sim_irqchip = {
.name = "irq_sim",
.irq_mask = irq_sim_irqmask,
.irq_unmask = irq_sim_irqunmask,
+ .irq_set_type = irq_sim_set_type,
};
static void irq_sim_handle_irq(struct irq_work *work)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index ef8ad36cadcf..9484e88dabc2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->depth = 1;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
+ desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
@@ -274,11 +275,12 @@ static struct attribute *irq_attrs[] = {
&actions_attr.attr,
NULL
};
+ATTRIBUTE_GROUPS(irq);
static struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
.sysfs_ops = &kobj_sysfs_ops,
- .default_attrs = irq_attrs,
+ .default_groups = irq_groups,
};
static void irq_sysfs_add(int irq, struct irq_desc *desc)
@@ -557,6 +559,7 @@ int __init early_irq_init(void)
alloc_masks(&desc[i], node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+ mutex_init(&desc[i].request_mutex);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
@@ -669,6 +672,45 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
set_irq_regs(old_regs);
return ret;
}
+
+#ifdef CONFIG_IRQ_DOMAIN
+/**
+ * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ * @regs: Register file coming from the low-level handling code
+ *
+ * This function must be called from an NMI context.
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ */
+int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
+ struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned int irq;
+ int ret = 0;
+
+ /*
+ * NMI context needs to be setup earlier in order to deal with tracing.
+ */
+ WARN_ON(!in_nmi());
+
+ irq = irq_find_mapping(domain, hwirq);
+
+ /*
+ * ack_bad_irq is not NMI-safe, just report
+ * an invalid interrupt.
+ */
+ if (likely(irq))
+ generic_handle_irq(irq);
+ else
+ ret = -EINVAL;
+
+ set_irq_regs(old_regs);
+ return ret;
+}
+#endif
#endif
/* Dynamic interrupt handling */
@@ -908,6 +950,11 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
+static bool irq_is_nmi(struct irq_desc *desc)
+{
+ return desc->istate & IRQS_NMI;
+}
+
/**
* kstat_irqs - Get the statistics for an interrupt
* @irq: The interrupt number
@@ -919,11 +966,16 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- int cpu;
unsigned int sum = 0;
+ int cpu;
if (!desc || !desc->kstat_irqs)
return 0;
+ if (!irq_settings_is_per_cpu_devid(desc) &&
+ !irq_settings_is_per_cpu(desc) &&
+ !irq_is_nmi(desc))
+ return desc->tot_count;
+
for_each_possible_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8b0be4bd6565..3078d0e48bba 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
* @ops: domain callbacks
* @host_data: Controller private data pointer
*
- * Allocates and initialize and irq_domain structure.
+ * Allocates and initializes an irq_domain structure.
* Returns pointer to IRQ domain, or NULL on failure.
*/
struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
@@ -139,7 +139,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
GFP_KERNEL, of_node_to_nid(of_node));
- if (WARN_ON(!domain))
+ if (!domain)
return NULL;
if (fwnode && is_fwnode_irqchip(fwnode)) {
@@ -458,6 +458,20 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
+/**
+ * irq_get_default_host() - Retrieve the "default" irq domain
+ *
+ * Returns: the default domain, if any.
+ *
+ * Modern code should never use this. This should only be used on
+ * systems that cannot implement a firmware->fwnode mapping (which
+ * both DT and ACPI provide).
+ */
+struct irq_domain *irq_get_default_host(void)
+{
+ return irq_default_domain;
+}
+
static void irq_domain_clear_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
@@ -729,16 +743,17 @@ static int irq_domain_translate(struct irq_domain *d,
return 0;
}
-static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
+static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+ unsigned int count,
struct irq_fwspec *fwspec)
{
int i;
- fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL;
- fwspec->param_count = irq_data->args_count;
+ fwspec->fwnode = np ? &np->fwnode : NULL;
+ fwspec->param_count = count;
- for (i = 0; i < irq_data->args_count; i++)
- fwspec->param[i] = irq_data->args[i];
+ for (i = 0; i < count; i++)
+ fwspec->param[i] = args[i];
}
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
@@ -836,7 +851,9 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
{
struct irq_fwspec fwspec;
- of_phandle_args_to_fwspec(irq_data, &fwspec);
+ of_phandle_args_to_fwspec(irq_data->np, irq_data->args,
+ irq_data->args_count, &fwspec);
+
return irq_create_fwspec_mapping(&fwspec);
}
EXPORT_SYMBOL_GPL(irq_create_of_mapping);
@@ -928,11 +945,10 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
const u32 *intspec, unsigned int intsize,
irq_hw_number_t *out_hwirq, unsigned int *out_type)
{
- if (WARN_ON(intsize < 2))
- return -EINVAL;
- *out_hwirq = intspec[0];
- *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
- return 0;
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec);
+ return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type);
}
EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
@@ -968,6 +984,27 @@ const struct irq_domain_ops irq_domain_simple_ops = {
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
+/**
+ * irq_domain_translate_twocell() - Generic translate for direct two cell
+ * bindings
+ *
+ * Device Tree IRQ specifier translation function which works with two cell
+ * bindings where the cell values map directly to the hwirq number
+ * and linux irq flags.
+ */
+int irq_domain_translate_twocell(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ unsigned long *out_hwirq,
+ unsigned int *out_type)
+{
+ if (WARN_ON(fwspec->param_count < 2))
+ return -EINVAL;
+ *out_hwirq = fwspec->param[0];
+ *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_domain_translate_twocell);
+
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
int node, const struct irq_affinity_desc *affinity)
{
@@ -1260,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
/**
* __irq_domain_alloc_irqs - Allocate IRQs from domain
* @domain: domain to allocate from
- * @irq_base: allocate specified IRQ nubmer if irq_base >= 0
+ * @irq_base: allocate specified IRQ number if irq_base >= 0
* @nr_irqs: number of IRQs to allocate
* @node: NUMA node id for memory allocation
* @arg: domain specific argument
@@ -1749,8 +1786,6 @@ void __init irq_domain_debugfs_init(struct dentry *root)
struct irq_domain *d;
domain_dir = debugfs_create_dir("domains", root);
- if (!domain_dir)
- return;
debugfs_create_file("default", 0444, domain_dir, NULL,
&irq_domain_debug_fops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 84b54a17b95d..e8f7f179bf77 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
@@ -34,8 +35,9 @@ static int __init setup_forced_irqthreads(char *arg)
early_param("threadirqs", setup_forced_irqthreads);
#endif
-static void __synchronize_hardirq(struct irq_desc *desc)
+static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
{
+ struct irq_data *irqd = irq_desc_get_irq_data(desc);
bool inprogress;
do {
@@ -51,6 +53,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
/* Ok, that indicated we're done: double-check carefully. */
raw_spin_lock_irqsave(&desc->lock, flags);
inprogress = irqd_irq_inprogress(&desc->irq_data);
+
+ /*
+ * If requested and supported, check at the chip whether it
+ * is in flight at the hardware level, i.e. already pending
+ * in a CPU and waiting for service and acknowledge.
+ */
+ if (!inprogress && sync_chip) {
+ /*
+ * Ignore the return code. inprogress is only updated
+ * when the chip supports it.
+ */
+ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
+ &inprogress);
+ }
raw_spin_unlock_irqrestore(&desc->lock, flags);
/* Oops, that failed? */
@@ -73,13 +89,18 @@ static void __synchronize_hardirq(struct irq_desc *desc)
* Returns: false if a threaded handler is active.
*
* This function may be called - with care - from IRQ context.
+ *
+ * It does not check whether there is an interrupt in flight at the
+ * hardware level, but not serviced yet, as this might deadlock when
+ * called with interrupts disabled and the target CPU of the interrupt
+ * is the current CPU.
*/
bool synchronize_hardirq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (desc) {
- __synchronize_hardirq(desc);
+ __synchronize_hardirq(desc, false);
return !atomic_read(&desc->threads_active);
}
@@ -95,14 +116,19 @@ EXPORT_SYMBOL(synchronize_hardirq);
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need you will deadlock.
*
- * This function may be called - with care - from IRQ context.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
+ *
+ * It optionally makes sure (when the irq chip supports that method)
+ * that the interrupt is not pending in any CPU and waiting for
+ * service.
*/
void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (desc) {
- __synchronize_hardirq(desc);
+ __synchronize_hardirq(desc, true);
/*
* We made sure that no hardirq handler is
* running. Now verify that no threaded handlers are
@@ -196,6 +222,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
cpumask_copy(desc->irq_common_data.affinity, mask);
+ /* fall through */
case IRQ_SET_MASK_OK_NOCOPY:
irq_validate_effective_affinity(data);
irq_set_thread_affinity(desc);
@@ -341,7 +368,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
/* The release function is promised process context */
might_sleep();
- if (!desc)
+ if (!desc || desc->istate & IRQS_NMI)
return -EINVAL;
/* Complete initialisation of *notify */
@@ -356,8 +383,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
desc->affinity_notify = notify;
raw_spin_unlock_irqrestore(&desc->lock, flags);
- if (old_notify)
+ if (old_notify) {
+ cancel_work_sync(&old_notify->work);
kref_put(&old_notify->kref, old_notify->release);
+ }
return 0;
}
@@ -553,6 +582,21 @@ bool disable_hardirq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_hardirq);
+/**
+ * disable_nmi_nosync - disable an nmi without waiting
+ * @irq: Interrupt to disable
+ *
+ * Disable the selected interrupt line. Disables and enables are
+ * nested.
+ * The interrupt to disable must have been requested through request_nmi.
+ * Unlike disable_nmi(), this function does not ensure existing
+ * instances of the IRQ handler have completed before returning.
+ */
+void disable_nmi_nosync(unsigned int irq)
+{
+ disable_irq_nosync(irq);
+}
+
void __enable_irq(struct irq_desc *desc)
{
switch (desc->depth) {
@@ -609,6 +653,20 @@ out:
}
EXPORT_SYMBOL(enable_irq);
+/**
+ * enable_nmi - enable handling of an nmi
+ * @irq: Interrupt to enable
+ *
+ * The interrupt to enable must have been requested through request_nmi.
+ * Undoes the effect of one call to disable_nmi(). If this
+ * matches the last disable, processing of interrupts on this
+ * IRQ line is re-enabled.
+ */
+void enable_nmi(unsigned int irq)
+{
+ enable_irq(irq);
+}
+
static int set_irq_wake_real(unsigned int irq, unsigned int on)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -644,6 +702,12 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
if (!desc)
return -EINVAL;
+ /* Don't use NMIs as wake up interrupts please */
+ if (desc->istate & IRQS_NMI) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/* wakeup-capable irqs can be shared between drivers that
* don't need to have the same sleep mode behaviors.
*/
@@ -666,6 +730,8 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on)
irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
}
}
+
+out_unlock:
irq_put_desc_busunlock(desc, flags);
return ret;
}
@@ -726,6 +792,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
+ /* fall through */
case IRQ_SET_MASK_OK_NOCOPY:
flags = irqd_get_trigger_type(&desc->irq_data);
@@ -740,7 +807,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
ret = 0;
break;
default:
- pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
+ pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n",
flags, irq_desc_get_irq(desc), chip->irq_set_type);
}
if (unmask)
@@ -1128,6 +1195,39 @@ static void irq_release_resources(struct irq_desc *desc)
c->irq_release_resources(d);
}
+static bool irq_supports_nmi(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ /* Only IRQs directly managed by the root irqchip can be set as NMI */
+ if (d->parent_data)
+ return false;
+#endif
+ /* Don't support NMIs for chips behind a slow bus */
+ if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock)
+ return false;
+
+ return d->chip->flags & IRQCHIP_SUPPORTS_NMI;
+}
+
+static int irq_nmi_setup(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL;
+}
+
+static void irq_nmi_teardown(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct irq_chip *c = d->chip;
+
+ if (c->irq_nmi_teardown)
+ c->irq_nmi_teardown(d);
+}
+
static int
setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
{
@@ -1302,9 +1402,17 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* fields must have IRQF_SHARED set and the bits which
* set the trigger type must match. Also all must
* agree on ONESHOT.
+ * Interrupt lines used for NMIs cannot be shared.
*/
unsigned int oldtype;
+ if (desc->istate & IRQS_NMI) {
+ pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
+ new->name, irq, desc->irq_data.chip->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
/*
* If nobody did set the configuration before, inherit
* the one provided by the requester.
@@ -1617,6 +1725,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
irq_settings_clr_disable_unlazy(desc);
+ /* Only shutdown. Deactivate after synchronize_hardirq() */
irq_shutdown(desc);
}
@@ -1645,8 +1754,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
unregister_handler_proc(irq, action);
- /* Make sure it's not being used on another CPU: */
- synchronize_hardirq(irq);
+ /*
+ * Make sure it's not being used on another CPU and if the chip
+ * supports it also make sure that there is no (not yet serviced)
+ * interrupt in flight at the hardware level.
+ */
+ __synchronize_hardirq(desc, true);
#ifdef CONFIG_DEBUG_SHIRQ
/*
@@ -1686,6 +1799,14 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* require it to deallocate resources over the slow bus.
*/
chip_bus_lock(desc);
+ /*
+ * There is no interrupt on the fly anymore. Deactivate it
+ * completely.
+ */
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ irq_domain_deactivate_irq(&desc->irq_data);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
irq_release_resources(desc);
chip_bus_sync_unlock(desc);
irq_remove_timings(desc);
@@ -1756,6 +1877,59 @@ const void *free_irq(unsigned int irq, void *dev_id)
}
EXPORT_SYMBOL(free_irq);
+/* This function must be called with desc->lock held */
+static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
+{
+ const char *devname = NULL;
+
+ desc->istate &= ~IRQS_NMI;
+
+ if (!WARN_ON(desc->action == NULL)) {
+ irq_pm_remove_action(desc, desc->action);
+ devname = desc->action->name;
+ unregister_handler_proc(irq, desc->action);
+
+ kfree(desc->action);
+ desc->action = NULL;
+ }
+
+ irq_settings_clr_disable_unlazy(desc);
+ irq_shutdown_and_deactivate(desc);
+
+ irq_release_resources(desc);
+
+ irq_chip_pm_put(&desc->irq_data);
+ module_put(desc->owner);
+
+ return devname;
+}
+
+const void *free_nmi(unsigned int irq, void *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ const void *devname;
+
+ if (!desc || WARN_ON(!(desc->istate & IRQS_NMI)))
+ return NULL;
+
+ if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return NULL;
+
+ /* NMI still enabled */
+ if (WARN_ON(desc->depth == 0))
+ disable_nmi_nosync(irq);
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ irq_nmi_teardown(desc);
+ devname = __cleanup_nmi(irq, desc);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return devname;
+}
+
/**
* request_threaded_irq - allocate an interrupt line
* @irq: Interrupt line to allocate
@@ -1925,6 +2099,101 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
}
EXPORT_SYMBOL_GPL(request_any_context_irq);
+/**
+ * request_nmi - allocate an interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * Threaded handler for threaded interrupts.
+ * @irqflags: Interrupt type flags
+ * @name: An ascii name for the claiming device
+ * @dev_id: A cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources and enables the
+ * interrupt line and IRQ handling. It sets up the IRQ line
+ * to be handled as an NMI.
+ *
+ * An interrupt line delivering NMIs cannot be shared and IRQ handling
+ * cannot be threaded.
+ *
+ * Interrupt lines requested for NMI delivering must produce per cpu
+ * interrupts and have auto enabling setting disabled.
+ *
+ * Dev_id must be globally unique. Normally the address of the
+ * device data structure is used as the cookie. Since the handler
+ * receives this value it makes sense to use it.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail and return a negative value.
+ */
+int request_nmi(unsigned int irq, irq_handler_t handler,
+ unsigned long irqflags, const char *name, void *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int retval;
+
+ if (irq == IRQ_NOTCONNECTED)
+ return -ENOTCONN;
+
+ /* NMI cannot be shared, used for Polling */
+ if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL))
+ return -EINVAL;
+
+ if (!(irqflags & IRQF_PERCPU))
+ return -EINVAL;
+
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+
+ if (!desc || irq_settings_can_autoenable(desc) ||
+ !irq_settings_can_request(desc) ||
+ WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING;
+ action->name = name;
+ action->dev_id = dev_id;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ /* Setup NMI state */
+ desc->istate |= IRQS_NMI;
+ retval = irq_nmi_setup(desc);
+ if (retval) {
+ __cleanup_nmi(irq, desc);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return -EINVAL;
+ }
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
void enable_percpu_irq(unsigned int irq, unsigned int type)
{
unsigned int cpu = smp_processor_id();
@@ -1959,6 +2228,11 @@ out:
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);
+void enable_percpu_nmi(unsigned int irq, unsigned int type)
+{
+ enable_percpu_irq(irq, type);
+}
+
/**
* irq_percpu_is_enabled - Check whether the per cpu irq is enabled
* @irq: Linux irq number to check for
@@ -1998,6 +2272,11 @@ void disable_percpu_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_percpu_irq);
+void disable_percpu_nmi(unsigned int irq)
+{
+ disable_percpu_irq(irq);
+}
+
/*
* Internal function to unregister a percpu irqaction.
*/
@@ -2029,6 +2308,8 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_
/* Found it - now remove it from the list of entries: */
desc->action = NULL;
+ desc->istate &= ~IRQS_NMI;
+
raw_spin_unlock_irqrestore(&desc->lock, flags);
unregister_handler_proc(irq, action);
@@ -2082,6 +2363,19 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
}
EXPORT_SYMBOL_GPL(free_percpu_irq);
+void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_is_per_cpu_devid(desc))
+ return;
+
+ if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ return;
+
+ kfree(__free_percpu_irq(irq, dev_id));
+}
+
/**
* setup_percpu_irq - setup a per-cpu interrupt
* @irq: Interrupt line to setup
@@ -2172,6 +2466,180 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
+ * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @name: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
+ * have to be setup on each CPU by calling prepare_percpu_nmi() before
+ * being enabled on the same CPU by using enable_percpu_nmi().
+ *
+ * Dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
+ *
+ * Interrupt lines requested for NMI delivering should have auto enabling
+ * setting disabled.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail returning a negative value.
+ */
+int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
+ const char *name, void __percpu *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ unsigned long flags;
+ int retval;
+
+ if (!handler)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_can_request(desc) ||
+ !irq_settings_is_per_cpu_devid(desc) ||
+ irq_settings_can_autoenable(desc) ||
+ !irq_supports_nmi(desc))
+ return -EINVAL;
+
+ /* The line cannot already be NMI */
+ if (desc->istate & IRQS_NMI)
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD
+ | IRQF_NOBALANCING;
+ action->name = name;
+ action->percpu_dev_id = dev_id;
+
+ retval = irq_chip_pm_get(&desc->irq_data);
+ if (retval < 0)
+ goto err_out;
+
+ retval = __setup_irq(irq, desc, action);
+ if (retval)
+ goto err_irq_setup;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ desc->istate |= IRQS_NMI;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ return 0;
+
+err_irq_setup:
+ irq_chip_pm_put(&desc->irq_data);
+err_out:
+ kfree(action);
+
+ return retval;
+}
+
+/**
+ * prepare_percpu_nmi - performs CPU local setup for NMI delivery
+ * @irq: Interrupt line to prepare for NMI delivery
+ *
+ * This call prepares an interrupt line to deliver NMI on the current CPU,
+ * before that interrupt line gets enabled with enable_percpu_nmi().
+ *
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ *
+ * If the interrupt line cannot be used to deliver NMIs, function
+ * will fail returning a negative value.
+ */
+int prepare_percpu_nmi(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+ int ret = 0;
+
+ WARN_ON(preemptible());
+
+ desc = irq_get_desc_lock(irq, &flags,
+ IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return -EINVAL;
+
+ if (WARN(!(desc->istate & IRQS_NMI),
+ KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
+ irq)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = irq_nmi_setup(desc);
+ if (ret) {
+ pr_err("Failed to setup NMI delivery: irq %u\n", irq);
+ goto out;
+ }
+
+out:
+ irq_put_desc_unlock(desc, flags);
+ return ret;
+}
+
+/**
+ * teardown_percpu_nmi - undoes NMI setup of IRQ line
+ * @irq: Interrupt line from which CPU local NMI configuration should be
+ * removed
+ *
+ * This call undoes the setup done by prepare_percpu_nmi().
+ *
+ * IRQ line should not be enabled for the current CPU.
+ *
+ * As a CPU local operation, this should be called from non-preemptible
+ * context.
+ */
+void teardown_percpu_nmi(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc;
+
+ WARN_ON(preemptible());
+
+ desc = irq_get_desc_lock(irq, &flags,
+ IRQ_GET_DESC_CHECK_PERCPU);
+ if (!desc)
+ return;
+
+ if (WARN_ON(!(desc->istate & IRQS_NMI)))
+ goto out;
+
+ irq_nmi_teardown(desc);
+out:
+ irq_put_desc_unlock(desc, flags);
+}
+
+int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct irq_chip *chip;
+ int err = -EINVAL;
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_get_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_get_irqchip_state(data, which, state);
+ return err;
+}
+
+/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
* @irq: Interrupt line that is forwarded to a VM
* @which: One of IRQCHIP_STATE_* the caller wants to know about
@@ -2189,7 +2657,6 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
{
struct irq_desc *desc;
struct irq_data *data;
- struct irq_chip *chip;
unsigned long flags;
int err = -EINVAL;
@@ -2199,19 +2666,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
data = irq_desc_get_irq_data(desc);
- do {
- chip = irq_data_get_irq_chip(data);
- if (chip->irq_get_irqchip_state)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
-
- if (data)
- err = chip->irq_get_irqchip_state(data, which, state);
+ err = __irq_get_irqchip_state(data, which, state);
irq_put_desc_busunlock(desc, flags);
return err;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 6d2fa6914b30..2ed97a7c9b2a 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
*/
raw_spin_lock_irqsave(&desc->lock, flags);
for_each_action_of_desc(desc, action) {
- printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
+ printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler);
if (action->thread_fn)
- printk(KERN_CONT " threaded [<%p>] %pf",
+ printk(KERN_CONT " threaded [<%p>] %ps",
action->thread_fn, action->thread_fn);
printk(KERN_CONT "\n");
}
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 1e4cb63a5c82..e960d7ce7bcc 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -1,14 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
+#define pr_fmt(fmt) "irq_timings: " fmt
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/static_key.h>
+#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/idr.h>
#include <linux/irq.h>
#include <linux/math64.h>
+#include <linux/log2.h>
#include <trace/events/irq.h>
@@ -18,16 +21,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
DEFINE_PER_CPU(struct irq_timings, irq_timings);
-struct irqt_stat {
- u64 next_evt;
- u64 last_ts;
- u64 variance;
- u32 avg;
- u32 nr_samples;
- int anomalies;
- int valid;
-};
-
static DEFINE_IDR(irqt_stats);
void irq_timings_enable(void)
@@ -40,182 +33,473 @@ void irq_timings_disable(void)
static_branch_disable(&irq_timing_enabled);
}
-/**
- * irqs_update - update the irq timing statistics with a new timestamp
+/*
+ * The main goal of this algorithm is to predict the next interrupt
+ * occurrence on the current CPU.
+ *
+ * Currently, the interrupt timings are stored in a circular array
+ * buffer every time there is an interrupt, as a tuple: the interrupt
+ * number and the associated timestamp when the event occurred <irq,
+ * timestamp>.
+ *
+ * For every interrupt occurring in a short period of time, we can
+ * measure the elapsed time between the occurrences for the same
+ * interrupt and we end up with a suite of intervals. The experience
+ * showed the interrupts are often coming following a periodic
+ * pattern.
+ *
+ * The objective of the algorithm is to find out this periodic pattern
+ * in a fastest way and use its period to predict the next irq event.
+ *
+ * When the next interrupt event is requested, we are in the situation
+ * where the interrupts are disabled and the circular buffer
+ * containing the timings is filled with the events which happened
+ * after the previous next-interrupt-event request.
+ *
+ * At this point, we read the circular buffer and we fill the irq
+ * related statistics structure. After this step, the circular array
+ * containing the timings is empty because all the values are
+ * dispatched in their corresponding buffers.
+ *
+ * Now for each interrupt, we can predict the next event by using the
+ * suffix array, log interval and exponential moving average
+ *
+ * 1. Suffix array
+ *
+ * Suffix array is an array of all the suffixes of a string. It is
+ * widely used as a data structure for compression, text search, ...
+ * For instance for the word 'banana', the suffixes will be: 'banana'
+ * 'anana' 'nana' 'ana' 'na' 'a'
+ *
+ * Usually, the suffix array is sorted but for our purpose it is
+ * not necessary and won't provide any improvement in the context of
+ * the solved problem where we clearly define the boundaries of the
+ * search by a max period and min period.
+ *
+ * The suffix array will build a suite of intervals of different
+ * length and will look for the repetition of each suite. If the suite
+ * is repeating then we have the period because it is the length of
+ * the suite whatever its position in the buffer.
+ *
+ * 2. Log interval
+ *
+ * We saw the irq timings allow to compute the interval of the
+ * occurrences for a specific interrupt. We can reasonibly assume the
+ * longer is the interval, the higher is the error for the next event
+ * and we can consider storing those interval values into an array
+ * where each slot in the array correspond to an interval at the power
+ * of 2 of the index. For example, index 12 will contain values
+ * between 2^11 and 2^12.
+ *
+ * At the end we have an array of values where at each index defines a
+ * [2^index - 1, 2 ^ index] interval values allowing to store a large
+ * number of values inside a small array.
+ *
+ * For example, if we have the value 1123, then we store it at
+ * ilog2(1123) = 10 index value.
+ *
+ * Storing those value at the specific index is done by computing an
+ * exponential moving average for this specific slot. For instance,
+ * for values 1800, 1123, 1453, ... fall under the same slot (10) and
+ * the exponential moving average is computed every time a new value
+ * is stored at this slot.
+ *
+ * 3. Exponential Moving Average
+ *
+ * The EMA is largely used to track a signal for stocks or as a low
+ * pass filter. The magic of the formula, is it is very simple and the
+ * reactivity of the average can be tuned with the factors called
+ * alpha.
+ *
+ * The higher the alphas are, the faster the average respond to the
+ * signal change. In our case, if a slot in the array is a big
+ * interval, we can have numbers with a big difference between
+ * them. The impact of those differences in the average computation
+ * can be tuned by changing the alpha value.
+ *
+ *
+ * -- The algorithm --
+ *
+ * We saw the different processing above, now let's see how they are
+ * used together.
+ *
+ * For each interrupt:
+ * For each interval:
+ * Compute the index = ilog2(interval)
+ * Compute a new_ema(buffer[index], interval)
+ * Store the index in a circular buffer
+ *
+ * Compute the suffix array of the indexes
+ *
+ * For each suffix:
+ * If the suffix is reverse-found 3 times
+ * Return suffix
+ *
+ * Return Not found
+ *
+ * However we can not have endless suffix array to be build, it won't
+ * make sense and it will add an extra overhead, so we can restrict
+ * this to a maximum suffix length of 5 and a minimum suffix length of
+ * 2. The experience showed 5 is the majority of the maximum pattern
+ * period found for different devices.
+ *
+ * The result is a pattern finding less than 1us for an interrupt.
+ *
+ * Example based on real values:
+ *
+ * Example 1 : MMC write/read interrupt interval:
+ *
+ * 223947, 1240, 1384, 1386, 1386,
+ * 217416, 1236, 1384, 1386, 1387,
+ * 214719, 1241, 1386, 1387, 1384,
+ * 213696, 1234, 1384, 1386, 1388,
+ * 219904, 1240, 1385, 1389, 1385,
+ * 212240, 1240, 1386, 1386, 1386,
+ * 214415, 1236, 1384, 1386, 1387,
+ * 214276, 1234, 1384, 1388, ?
+ *
+ * For each element, apply ilog2(value)
*
- * @irqs: an irqt_stat struct pointer
- * @ts: the new timestamp
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, ?
*
- * The statistics are computed online, in other words, the code is
- * designed to compute the statistics on a stream of values rather
- * than doing multiple passes on the values to compute the average,
- * then the variance. The integer division introduces a loss of
- * precision but with an acceptable error margin regarding the results
- * we would have with the double floating precision: we are dealing
- * with nanosec, so big numbers, consequently the mantisse is
- * negligeable, especially when converting the time in usec
- * afterwards.
+ * Max period of 5, we take the last (max_period * 3) 15 elements as
+ * we can be confident if the pattern repeats itself three times it is
+ * a repeating pattern.
*
- * The computation happens at idle time. When the CPU is not idle, the
- * interrupts' timestamps are stored in the circular buffer, when the
- * CPU goes idle and this routine is called, all the buffer's values
- * are injected in the statistical model continuying to extend the
- * statistics from the previous busy-idle cycle.
+ * 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, 8,
+ * 15, 8, 8, 8, ?
*
- * The observations showed a device will trigger a burst of periodic
- * interrupts followed by one or two peaks of longer time, for
- * instance when a SD card device flushes its cache, then the periodic
- * intervals occur again. A one second inactivity period resets the
- * stats, that gives us the certitude the statistical values won't
- * exceed 1x10^9, thus the computation won't overflow.
+ * Suffixes are:
*
- * Basically, the purpose of the algorithm is to watch the periodic
- * interrupts and eliminate the peaks.
+ * 1) 8, 15, 8, 8, 8 <- max period
+ * 2) 8, 15, 8, 8
+ * 3) 8, 15, 8
+ * 4) 8, 15 <- min period
*
- * An interrupt is considered periodically stable if the interval of
- * its occurences follow the normal distribution, thus the values
- * comply with:
+ * From there we search the repeating pattern for each suffix.
*
- * avg - 3 x stddev < value < avg + 3 x stddev
+ * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8
+ * | | | | | | | | | | | | | | |
+ * 8, 15, 8, 8, 8 | | | | | | | | | |
+ * 8, 15, 8, 8, 8 | | | | |
+ * 8, 15, 8, 8, 8
*
- * Which can be simplified to:
+ * When moving the suffix, we found exactly 3 matches.
*
- * -3 x stddev < value - avg < 3 x stddev
+ * The first suffix with period 5 is repeating.
*
- * abs(value - avg) < 3 x stddev
+ * The next event is (3 * max_period) % suffix_period
*
- * In order to save a costly square root computation, we use the
- * variance. For the record, stddev = sqrt(variance). The equation
- * above becomes:
+ * In this example, the result 0, so the next event is suffix[0] => 8
*
- * abs(value - avg) < 3 x sqrt(variance)
+ * However, 8 is the index in the array of exponential moving average
+ * which was calculated on the fly when storing the values, so the
+ * interval is ema[8] = 1366
*
- * And finally we square it:
*
- * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
+ * Example 2:
*
- * (value - avg) x (value - avg) < 9 x variance
+ * 4, 3, 5, 100,
+ * 3, 3, 5, 117,
+ * 4, 4, 5, 112,
+ * 4, 3, 4, 110,
+ * 3, 5, 3, 117,
+ * 4, 4, 5, 112,
+ * 4, 3, 4, 110,
+ * 3, 4, 5, 112,
+ * 4, 3, 4, 110
*
- * Statistically speaking, any values out of this interval is
- * considered as an anomaly and is discarded. However, a normal
- * distribution appears when the number of samples is 30 (it is the
- * rule of thumb in statistics, cf. "30 samples" on Internet). When
- * there are three consecutive anomalies, the statistics are resetted.
+ * ilog2
*
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4
+ *
+ * Max period 5:
+ * 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4,
+ * 0, 0, 0, 4
+ *
+ * Suffixes:
+ *
+ * 1) 0, 0, 4, 0, 0
+ * 2) 0, 0, 4, 0
+ * 3) 0, 0, 4
+ * 4) 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ * | | | | | | X
+ * 0, 0, 4, 0, 0, | X
+ * 0, 0
+ *
+ * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4
+ * | | | | | | | | | | | | | | |
+ * 0, 0, 4, 0, | | | | | | | | | | |
+ * 0, 0, 4, 0, | | | | | | |
+ * 0, 0, 4, 0, | | |
+ * 0 0 4
+ *
+ * Pattern is found 3 times, the remaining is 1 which results from
+ * (max_period * 3) % suffix_period. This value is the index in the
+ * suffix arrays. The suffix array for a period 4 has the value 4
+ * at index 1.
+ */
+#define EMA_ALPHA_VAL 64
+#define EMA_ALPHA_SHIFT 7
+
+#define PREDICTION_PERIOD_MIN 3
+#define PREDICTION_PERIOD_MAX 5
+#define PREDICTION_FACTOR 4
+#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
+#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
+
+/*
+ * Number of elements in the circular buffer: If it happens it was
+ * flushed before, then the number of elements could be smaller than
+ * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
+ * used as we wrapped. The index begins from zero when we did not
+ * wrap. That could be done in a nicer way with the proper circular
+ * array structure type but with the cost of extra computation in the
+ * interrupt handler hot path. We choose efficiency.
+ */
+#define for_each_irqts(i, irqts) \
+ for (i = irqts->count < IRQ_TIMINGS_SIZE ? \
+ 0 : irqts->count & IRQ_TIMINGS_MASK, \
+ irqts->count = min(IRQ_TIMINGS_SIZE, \
+ irqts->count); \
+ irqts->count > 0; irqts->count--, \
+ i = (i + 1) & IRQ_TIMINGS_MASK)
+
+struct irqt_stat {
+ u64 last_ts;
+ u64 ema_time[PREDICTION_BUFFER_SIZE];
+ int timings[IRQ_TIMINGS_SIZE];
+ int circ_timings[IRQ_TIMINGS_SIZE];
+ int count;
+};
+
+/*
+ * Exponential moving average computation
*/
-static void irqs_update(struct irqt_stat *irqs, u64 ts)
+static u64 irq_timings_ema_new(u64 value, u64 ema_old)
{
- u64 old_ts = irqs->last_ts;
- u64 variance = 0;
- u64 interval;
s64 diff;
+ if (unlikely(!ema_old))
+ return value;
+
+ diff = (value - ema_old) * EMA_ALPHA_VAL;
/*
- * The timestamps are absolute time values, we need to compute
- * the timing interval between two interrupts.
+ * We can use a s64 type variable to be added with the u64
+ * ema_old variable as this one will never have its topmost
+ * bit set, it will be always smaller than 2^63 nanosec
+ * interrupt interval (292 years).
*/
- irqs->last_ts = ts;
+ return ema_old + (diff >> EMA_ALPHA_SHIFT);
+}
+
+static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
+{
+ int period;
/*
- * The interval type is u64 in order to deal with the same
- * type in our computation, that prevent mindfuck issues with
- * overflow, sign and division.
+ * Move the beginning pointer to the end minus the max period x 3.
+ * We are at the point we can begin searching the pattern
*/
- interval = ts - old_ts;
+ buffer = &buffer[len - (period_max * 3)];
+
+ /* Adjust the length to the maximum allowed period x 3 */
+ len = period_max * 3;
/*
- * The interrupt triggered more than one second apart, that
- * ends the sequence as predictible for our purpose. In this
- * case, assume we have the beginning of a sequence and the
- * timestamp is the first value. As it is impossible to
- * predict anything at this point, return.
- *
- * Note the first timestamp of the sequence will always fall
- * in this test because the old_ts is zero. That is what we
- * want as we need another timestamp to compute an interval.
+ * The buffer contains the suite of intervals, in a ilog2
+ * basis, we are looking for a repetition. We point the
+ * beginning of the search three times the length of the
+ * period beginning at the end of the buffer. We do that for
+ * each suffix.
*/
- if (interval >= NSEC_PER_SEC) {
- memset(irqs, 0, sizeof(*irqs));
- irqs->last_ts = ts;
- return;
+ for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
+
+ /*
+ * The first comparison always succeed because the
+ * suffix is deduced from the first n-period bytes of
+ * the buffer and we compare the initial suffix with
+ * itself, so we can skip the first iteration.
+ */
+ int idx = period;
+ size_t size = period;
+
+ /*
+ * We look if the suite with period 'i' repeat
+ * itself. If it is truncated at the end, as it
+ * repeats we can use the period to find out the next
+ * element with the modulo.
+ */
+ while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
+
+ /*
+ * Move the index in a period basis
+ */
+ idx += size;
+
+ /*
+ * If this condition is reached, all previous
+ * memcmp were successful, so the period is
+ * found.
+ */
+ if (idx == len)
+ return buffer[len % period];
+
+ /*
+ * If the remaining elements to compare are
+ * smaller than the period, readjust the size
+ * of the comparison for the last iteration.
+ */
+ if (len - idx < period)
+ size = len - idx;
+ }
+ }
+
+ return -1;
+}
+
+static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
+{
+ int index, i, period_max, count, start, min = INT_MAX;
+
+ if ((now - irqs->last_ts) >= NSEC_PER_SEC) {
+ irqs->count = irqs->last_ts = 0;
+ return U64_MAX;
}
/*
- * Pre-compute the delta with the average as the result is
- * used several times in this function.
+ * As we want to find three times the repetition, we need a
+ * number of intervals greater or equal to three times the
+ * maximum period, otherwise we truncate the max period.
*/
- diff = interval - irqs->avg;
+ period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ?
+ PREDICTION_PERIOD_MAX : irqs->count / 3;
/*
- * Increment the number of samples.
+ * If we don't have enough irq timings for this prediction,
+ * just bail out.
*/
- irqs->nr_samples++;
+ if (period_max <= PREDICTION_PERIOD_MIN)
+ return U64_MAX;
/*
- * Online variance divided by the number of elements if there
- * is more than one sample. Normally the formula is division
- * by nr_samples - 1 but we assume the number of element will be
- * more than 32 and dividing by 32 instead of 31 is enough
- * precise.
+ * 'count' will depends if the circular buffer wrapped or not
*/
- if (likely(irqs->nr_samples > 1))
- variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
+ count = irqs->count < IRQ_TIMINGS_SIZE ?
+ irqs->count : IRQ_TIMINGS_SIZE;
+
+ start = irqs->count < IRQ_TIMINGS_SIZE ?
+ 0 : (irqs->count & IRQ_TIMINGS_MASK);
/*
- * The rule of thumb in statistics for the normal distribution
- * is having at least 30 samples in order to have the model to
- * apply. Values outside the interval are considered as an
- * anomaly.
+ * Copy the content of the circular buffer into another buffer
+ * in order to linearize the buffer instead of dealing with
+ * wrapping indexes and shifted array which will be prone to
+ * error and extremelly difficult to debug.
*/
- if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
- /*
- * After three consecutive anomalies, we reset the
- * stats as it is no longer stable enough.
- */
- if (irqs->anomalies++ >= 3) {
- memset(irqs, 0, sizeof(*irqs));
- irqs->last_ts = ts;
- return;
- }
- } else {
- /*
- * The anomalies must be consecutives, so at this
- * point, we reset the anomalies counter.
- */
- irqs->anomalies = 0;
+ for (i = 0; i < count; i++) {
+ int index = (start + i) & IRQ_TIMINGS_MASK;
+
+ irqs->timings[i] = irqs->circ_timings[index];
+ min = min_t(int, irqs->timings[i], min);
}
+ index = irq_timings_next_event_index(irqs->timings, count, period_max);
+ if (index < 0)
+ return irqs->last_ts + irqs->ema_time[min];
+
+ return irqs->last_ts + irqs->ema_time[index];
+}
+
+static __always_inline int irq_timings_interval_index(u64 interval)
+{
/*
- * The interrupt is considered stable enough to try to predict
- * the next event on it.
+ * The PREDICTION_FACTOR increase the interval size for the
+ * array of exponential average.
*/
- irqs->valid = 1;
+ u64 interval_us = (interval >> 10) / PREDICTION_FACTOR;
+
+ return likely(interval_us) ? ilog2(interval_us) : 0;
+}
+
+static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
+ u64 interval)
+{
+ int index;
/*
- * Online average algorithm:
- *
- * new_average = average + ((value - average) / count)
- *
- * The variance computation depends on the new average
- * to be computed here first.
- *
+ * Get the index in the ema table for this interrupt.
*/
- irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
+ index = irq_timings_interval_index(interval);
/*
- * Online variance algorithm:
- *
- * new_variance = variance + (value - average) x (value - new_average)
- *
- * Warning: irqs->avg is updated with the line above, hence
- * 'interval - irqs->avg' is no longer equal to 'diff'
+ * Store the index as an element of the pattern in another
+ * circular array.
*/
- irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
+ irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
+
+ irqs->ema_time[index] = irq_timings_ema_new(interval,
+ irqs->ema_time[index]);
+
+ irqs->count++;
+}
+
+static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
+{
+ u64 old_ts = irqs->last_ts;
+ u64 interval;
/*
- * Update the next event
+ * The timestamps are absolute time values, we need to compute
+ * the timing interval between two interrupts.
+ */
+ irqs->last_ts = ts;
+
+ /*
+ * The interval type is u64 in order to deal with the same
+ * type in our computation, that prevent mindfuck issues with
+ * overflow, sign and division.
+ */
+ interval = ts - old_ts;
+
+ /*
+ * The interrupt triggered more than one second apart, that
+ * ends the sequence as predictible for our purpose. In this
+ * case, assume we have the beginning of a sequence and the
+ * timestamp is the first value. As it is impossible to
+ * predict anything at this point, return.
+ *
+ * Note the first timestamp of the sequence will always fall
+ * in this test because the old_ts is zero. That is what we
+ * want as we need another timestamp to compute an interval.
*/
- irqs->next_evt = ts + irqs->avg;
+ if (interval >= NSEC_PER_SEC) {
+ irqs->count = 0;
+ return;
+ }
+
+ __irq_timings_store(irq, irqs, interval);
}
/**
@@ -259,6 +543,9 @@ u64 irq_timings_next_event(u64 now)
*/
lockdep_assert_irqs_disabled();
+ if (!irqts->count)
+ return next_evt;
+
/*
* Number of elements in the circular buffer: If it happens it
* was flushed before, then the number of elements could be
@@ -269,21 +556,15 @@ u64 irq_timings_next_event(u64 now)
* type but with the cost of extra computation in the
* interrupt handler hot path. We choose efficiency.
*
- * Inject measured irq/timestamp to the statistical model
- * while decrementing the counter because we consume the data
- * from our circular buffer.
+ * Inject measured irq/timestamp to the pattern prediction
+ * model while decrementing the counter because we consume the
+ * data from our circular buffer.
*/
- for (i = irqts->count & IRQ_TIMINGS_MASK,
- irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
- irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
-
+ for_each_irqts(i, irqts) {
irq = irq_timing_decode(irqts->values[i], &ts);
-
s = idr_find(&irqt_stats, irq);
- if (s) {
- irqs = this_cpu_ptr(s);
- irqs_update(irqs, ts);
- }
+ if (s)
+ irq_timings_store(irq, this_cpu_ptr(s), ts);
}
/*
@@ -294,26 +575,12 @@ u64 irq_timings_next_event(u64 now)
irqs = this_cpu_ptr(s);
- if (!irqs->valid)
- continue;
-
- if (irqs->next_evt <= now) {
- irq = i;
- next_evt = now;
+ ts = __irq_timings_next_event(irqs, i, now);
+ if (ts <= now)
+ return now;
- /*
- * This interrupt mustn't use in the future
- * until new events occur and update the
- * statistics.
- */
- irqs->valid = 0;
- break;
- }
-
- if (irqs->next_evt < next_evt) {
- irq = i;
- next_evt = irqs->next_evt;
- }
+ if (ts < next_evt)
+ next_evt = ts;
}
return next_evt;
@@ -360,3 +627,325 @@ int irq_timings_alloc(int irq)
return 0;
}
+
+#ifdef CONFIG_TEST_IRQ_TIMINGS
+struct timings_intervals {
+ u64 *intervals;
+ size_t count;
+};
+
+/*
+ * Intervals are given in nanosecond base
+ */
+static u64 intervals0[] __initdata = {
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000,
+};
+
+static u64 intervals1[] __initdata = {
+ 223947000, 1240000, 1384000, 1386000, 1386000,
+ 217416000, 1236000, 1384000, 1386000, 1387000,
+ 214719000, 1241000, 1386000, 1387000, 1384000,
+ 213696000, 1234000, 1384000, 1386000, 1388000,
+ 219904000, 1240000, 1385000, 1389000, 1385000,
+ 212240000, 1240000, 1386000, 1386000, 1386000,
+ 214415000, 1236000, 1384000, 1386000, 1387000,
+ 214276000, 1234000,
+};
+
+static u64 intervals2[] __initdata = {
+ 4000, 3000, 5000, 100000,
+ 3000, 3000, 5000, 117000,
+ 4000, 4000, 5000, 112000,
+ 4000, 3000, 4000, 110000,
+ 3000, 5000, 3000, 117000,
+ 4000, 4000, 5000, 112000,
+ 4000, 3000, 4000, 110000,
+ 3000, 4000, 5000, 112000,
+ 4000,
+};
+
+static u64 intervals3[] __initdata = {
+ 1385000, 212240000, 1240000,
+ 1386000, 214415000, 1236000,
+ 1384000, 214276000, 1234000,
+ 1386000, 214415000, 1236000,
+ 1385000, 212240000, 1240000,
+ 1386000, 214415000, 1236000,
+ 1384000, 214276000, 1234000,
+ 1386000, 214415000, 1236000,
+ 1385000, 212240000, 1240000,
+};
+
+static u64 intervals4[] __initdata = {
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000,
+};
+
+static struct timings_intervals tis[] __initdata = {
+ { intervals0, ARRAY_SIZE(intervals0) },
+ { intervals1, ARRAY_SIZE(intervals1) },
+ { intervals2, ARRAY_SIZE(intervals2) },
+ { intervals3, ARRAY_SIZE(intervals3) },
+ { intervals4, ARRAY_SIZE(intervals4) },
+};
+
+static int __init irq_timings_test_next_index(struct timings_intervals *ti)
+{
+ int _buffer[IRQ_TIMINGS_SIZE];
+ int buffer[IRQ_TIMINGS_SIZE];
+ int index, start, i, count, period_max;
+
+ count = ti->count - 1;
+
+ period_max = count > (3 * PREDICTION_PERIOD_MAX) ?
+ PREDICTION_PERIOD_MAX : count / 3;
+
+ /*
+ * Inject all values except the last one which will be used
+ * to compare with the next index result.
+ */
+ pr_debug("index suite: ");
+
+ for (i = 0; i < count; i++) {
+ index = irq_timings_interval_index(ti->intervals[i]);
+ _buffer[i & IRQ_TIMINGS_MASK] = index;
+ pr_cont("%d ", index);
+ }
+
+ start = count < IRQ_TIMINGS_SIZE ? 0 :
+ count & IRQ_TIMINGS_MASK;
+
+ count = min_t(int, count, IRQ_TIMINGS_SIZE);
+
+ for (i = 0; i < count; i++) {
+ int index = (start + i) & IRQ_TIMINGS_MASK;
+ buffer[i] = _buffer[index];
+ }
+
+ index = irq_timings_next_event_index(buffer, count, period_max);
+ i = irq_timings_interval_index(ti->intervals[ti->count - 1]);
+
+ if (index != i) {
+ pr_err("Expected (%d) and computed (%d) next indexes differ\n",
+ i, index);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init irq_timings_next_index_selftest(void)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(tis); i++) {
+
+ pr_info("---> Injecting intervals number #%d (count=%zd)\n",
+ i, tis[i].count);
+
+ ret = irq_timings_test_next_index(&tis[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_test_irqs(struct timings_intervals *ti)
+{
+ struct irqt_stat __percpu *s;
+ struct irqt_stat *irqs;
+ int i, index, ret, irq = 0xACE5;
+
+ ret = irq_timings_alloc(irq);
+ if (ret) {
+ pr_err("Failed to allocate irq timings\n");
+ return ret;
+ }
+
+ s = idr_find(&irqt_stats, irq);
+ if (!s) {
+ ret = -EIDRM;
+ goto out;
+ }
+
+ irqs = this_cpu_ptr(s);
+
+ for (i = 0; i < ti->count; i++) {
+
+ index = irq_timings_interval_index(ti->intervals[i]);
+ pr_debug("%d: interval=%llu ema_index=%d\n",
+ i, ti->intervals[i], index);
+
+ __irq_timings_store(irq, irqs, ti->intervals[i]);
+ if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
+ pr_err("Failed to store in the circular buffer\n");
+ goto out;
+ }
+ }
+
+ if (irqs->count != ti->count) {
+ pr_err("Count differs\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ irq_timings_free(irq);
+
+ return ret;
+}
+
+static int __init irq_timings_irqs_selftest(void)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(tis); i++) {
+ pr_info("---> Injecting intervals number #%d (count=%zd)\n",
+ i, tis[i].count);
+ ret = irq_timings_test_irqs(&tis[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_test_irqts(struct irq_timings *irqts,
+ unsigned count)
+{
+ int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0;
+ int i, irq, oirq = 0xBEEF;
+ u64 ots = 0xDEAD, ts;
+
+ /*
+ * Fill the circular buffer by using the dedicated function.
+ */
+ for (i = 0; i < count; i++) {
+ pr_debug("%d: index=%d, ts=%llX irq=%X\n",
+ i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i);
+
+ irq_timings_push(ots + i, oirq + i);
+ }
+
+ /*
+ * Compute the first elements values after the index wrapped
+ * up or not.
+ */
+ ots += start;
+ oirq += start;
+
+ /*
+ * Test the circular buffer count is correct.
+ */
+ pr_debug("---> Checking timings array count (%d) is right\n", count);
+ if (WARN_ON(irqts->count != count))
+ return -EINVAL;
+
+ /*
+ * Test the macro allowing to browse all the irqts.
+ */
+ pr_debug("---> Checking the for_each_irqts() macro\n");
+ for_each_irqts(i, irqts) {
+
+ irq = irq_timing_decode(irqts->values[i], &ts);
+
+ pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n",
+ i, ts, ots, irq, oirq);
+
+ if (WARN_ON(ts != ots || irq != oirq))
+ return -EINVAL;
+
+ ots++; oirq++;
+ }
+
+ /*
+ * The circular buffer should have be flushed when browsed
+ * with for_each_irqts
+ */
+ pr_debug("---> Checking timings array is empty after browsing it\n");
+ if (WARN_ON(irqts->count))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init irq_timings_irqts_selftest(void)
+{
+ struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
+ int i, ret;
+
+ /*
+ * Test the circular buffer with different number of
+ * elements. The purpose is to test at the limits (empty, half
+ * full, full, wrapped with the cursor at the boundaries,
+ * wrapped several times, etc ...
+ */
+ int count[] = { 0,
+ IRQ_TIMINGS_SIZE >> 1,
+ IRQ_TIMINGS_SIZE,
+ IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1),
+ 2 * IRQ_TIMINGS_SIZE,
+ (2 * IRQ_TIMINGS_SIZE) + 3,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(count); i++) {
+
+ pr_info("---> Checking the timings with %d/%d values\n",
+ count[i], IRQ_TIMINGS_SIZE);
+
+ ret = irq_timings_test_irqts(irqts, count[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_selftest(void)
+{
+ int ret;
+
+ pr_info("------------------- selftest start -----------------\n");
+
+ /*
+ * At this point, we don't except any subsystem to use the irq
+ * timings but us, so it should not be enabled.
+ */
+ if (static_branch_unlikely(&irq_timing_enabled)) {
+ pr_warn("irq timings already initialized, skipping selftest\n");
+ return 0;
+ }
+
+ ret = irq_timings_irqts_selftest();
+ if (ret)
+ goto out;
+
+ ret = irq_timings_irqs_selftest();
+ if (ret)
+ goto out;
+
+ ret = irq_timings_next_index_selftest();
+out:
+ pr_info("---------- selftest end with %s -----------\n",
+ ret ? "failure" : "success");
+
+ return ret;
+}
+early_initcall(irq_timings_selftest);
+#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 6b7cdf17ccf8..d42acaf81886 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
*
@@ -56,61 +57,70 @@ void __weak arch_irq_work_raise(void)
*/
}
-/*
- * Enqueue the irq_work @work on @cpu unless it's already pending
- * somewhere.
- *
- * Can be re-enqueued while the callback is still in progress.
- */
-bool irq_work_queue_on(struct irq_work *work, int cpu)
+/* Enqueue on current CPU, work must already be claimed and preempt disabled */
+static void __irq_work_queue_local(struct irq_work *work)
{
- /* All work should have been flushed before going offline */
- WARN_ON_ONCE(cpu_is_offline(cpu));
-
-#ifdef CONFIG_SMP
-
- /* Arch remote IPI send/receive backend aren't NMI safe */
- WARN_ON_ONCE(in_nmi());
+ /* If the work is "lazy", handle it from next tick if any */
+ if (work->flags & IRQ_WORK_LAZY) {
+ if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
+ tick_nohz_tick_stopped())
+ arch_irq_work_raise();
+ } else {
+ if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+ arch_irq_work_raise();
+ }
+}
+/* Enqueue the irq work @work on the current CPU */
+bool irq_work_queue(struct irq_work *work)
+{
/* Only queue if not already pending */
if (!irq_work_claim(work))
return false;
- if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
- arch_send_call_function_single_ipi(cpu);
-
-#else /* #ifdef CONFIG_SMP */
- irq_work_queue(work);
-#endif /* #else #ifdef CONFIG_SMP */
+ /* Queue the entry and raise the IPI if needed. */
+ preempt_disable();
+ __irq_work_queue_local(work);
+ preempt_enable();
return true;
}
+EXPORT_SYMBOL_GPL(irq_work_queue);
-/* Enqueue the irq work @work on the current CPU */
-bool irq_work_queue(struct irq_work *work)
+/*
+ * Enqueue the irq_work @work on @cpu unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
+ */
+bool irq_work_queue_on(struct irq_work *work, int cpu)
{
+#ifndef CONFIG_SMP
+ return irq_work_queue(work);
+
+#else /* CONFIG_SMP: */
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(cpu));
+
/* Only queue if not already pending */
if (!irq_work_claim(work))
return false;
- /* Queue the entry and raise the IPI if needed. */
preempt_disable();
-
- /* If the work is "lazy", handle it from next tick if any */
- if (work->flags & IRQ_WORK_LAZY) {
- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
- tick_nohz_tick_stopped())
- arch_irq_work_raise();
+ if (cpu != smp_processor_id()) {
+ /* Arch remote IPI send/receive backend aren't NMI safe */
+ WARN_ON_ONCE(in_nmi());
+ if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+ arch_send_call_function_single_ipi(cpu);
} else {
- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
- arch_irq_work_raise();
+ __irq_work_queue_local(work);
}
-
preempt_enable();
return true;
+#endif /* CONFIG_SMP */
}
-EXPORT_SYMBOL_GPL(irq_work_queue);
+
bool irq_work_needs_cpu(void)
{
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index bad96b476eb6..df3008419a1d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* jump label support
*
@@ -36,12 +37,26 @@ static int jump_label_cmp(const void *a, const void *b)
const struct jump_entry *jea = a;
const struct jump_entry *jeb = b;
+ /*
+ * Entrires are sorted by key.
+ */
if (jump_entry_key(jea) < jump_entry_key(jeb))
return -1;
if (jump_entry_key(jea) > jump_entry_key(jeb))
return 1;
+ /*
+ * In the batching mode, entries should also be sorted by the code
+ * inside the already sorted list of entries, enabling a bsearch in
+ * the vector.
+ */
+ if (jump_entry_code(jea) < jump_entry_code(jeb))
+ return -1;
+
+ if (jump_entry_code(jea) > jump_entry_code(jeb))
+ return 1;
+
return 0;
}
@@ -202,11 +217,13 @@ void static_key_disable(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_disable);
-static void __static_key_slow_dec_cpuslocked(struct static_key *key,
- unsigned long rate_limit,
- struct delayed_work *work)
+static bool static_key_slow_try_dec(struct static_key *key)
{
- lockdep_assert_cpus_held();
+ int val;
+
+ val = atomic_fetch_add_unless(&key->enabled, -1, 1);
+ if (val == 1)
+ return false;
/*
* The negative count check is valid even when a negative
@@ -215,63 +232,70 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key,
* returns is unbalanced, because all other static_key_slow_inc()
* instances block while the update is in progress.
*/
- if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
- WARN(atomic_read(&key->enabled) < 0,
- "jump label: negative count!\n");
+ WARN(val < 0, "jump label: negative count!\n");
+ return true;
+}
+
+static void __static_key_slow_dec_cpuslocked(struct static_key *key)
+{
+ lockdep_assert_cpus_held();
+
+ if (static_key_slow_try_dec(key))
return;
- }
- if (rate_limit) {
- atomic_inc(&key->enabled);
- schedule_delayed_work(work, rate_limit);
- } else {
+ jump_label_lock();
+ if (atomic_dec_and_test(&key->enabled))
jump_label_update(key);
- }
jump_label_unlock();
}
-static void __static_key_slow_dec(struct static_key *key,
- unsigned long rate_limit,
- struct delayed_work *work)
+static void __static_key_slow_dec(struct static_key *key)
{
cpus_read_lock();
- __static_key_slow_dec_cpuslocked(key, rate_limit, work);
+ __static_key_slow_dec_cpuslocked(key);
cpus_read_unlock();
}
-static void jump_label_update_timeout(struct work_struct *work)
+void jump_label_update_timeout(struct work_struct *work)
{
struct static_key_deferred *key =
container_of(work, struct static_key_deferred, work.work);
- __static_key_slow_dec(&key->key, 0, NULL);
+ __static_key_slow_dec(&key->key);
}
+EXPORT_SYMBOL_GPL(jump_label_update_timeout);
void static_key_slow_dec(struct static_key *key)
{
STATIC_KEY_CHECK_USE(key);
- __static_key_slow_dec(key, 0, NULL);
+ __static_key_slow_dec(key);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
void static_key_slow_dec_cpuslocked(struct static_key *key)
{
STATIC_KEY_CHECK_USE(key);
- __static_key_slow_dec_cpuslocked(key, 0, NULL);
+ __static_key_slow_dec_cpuslocked(key);
}
-void static_key_slow_dec_deferred(struct static_key_deferred *key)
+void __static_key_slow_dec_deferred(struct static_key *key,
+ struct delayed_work *work,
+ unsigned long timeout)
{
STATIC_KEY_CHECK_USE(key);
- __static_key_slow_dec(&key->key, key->timeout, &key->work);
+
+ if (static_key_slow_try_dec(key))
+ return;
+
+ schedule_delayed_work(work, timeout);
}
-EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
+EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred);
-void static_key_deferred_flush(struct static_key_deferred *key)
+void __static_key_deferred_flush(void *key, struct delayed_work *work)
{
STATIC_KEY_CHECK_USE(key);
- flush_delayed_work(&key->work);
+ flush_delayed_work(work);
}
-EXPORT_SYMBOL_GPL(static_key_deferred_flush);
+EXPORT_SYMBOL_GPL(__static_key_deferred_flush);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
@@ -374,25 +398,55 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry)
return enabled ^ branch;
}
+static bool jump_label_can_update(struct jump_entry *entry, bool init)
+{
+ /*
+ * Cannot update code that was in an init text area.
+ */
+ if (!init && jump_entry_is_init(entry))
+ return false;
+
+ if (!kernel_text_address(jump_entry_code(entry))) {
+ WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry));
+ return false;
+ }
+
+ return true;
+}
+
+#ifndef HAVE_JUMP_LABEL_BATCH
static void __jump_label_update(struct static_key *key,
struct jump_entry *entry,
struct jump_entry *stop,
bool init)
{
for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
- /*
- * An entry->code of 0 indicates an entry which has been
- * disabled because it was in an init text area.
- */
- if (init || !jump_entry_is_init(entry)) {
- if (kernel_text_address(jump_entry_code(entry)))
- arch_jump_label_transform(entry, jump_label_type(entry));
- else
- WARN_ONCE(1, "can't patch jump_label at %pS",
- (void *)jump_entry_code(entry));
+ if (jump_label_can_update(entry, init))
+ arch_jump_label_transform(entry, jump_label_type(entry));
+ }
+}
+#else
+static void __jump_label_update(struct static_key *key,
+ struct jump_entry *entry,
+ struct jump_entry *stop,
+ bool init)
+{
+ for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
+
+ if (!jump_label_can_update(entry, init))
+ continue;
+
+ if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) {
+ /*
+ * Queue is full: Apply the current queue and try again.
+ */
+ arch_jump_label_transform_apply();
+ BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry)));
}
}
+ arch_jump_label_transform_apply();
}
+#endif
void __init jump_label_init(void)
{
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f3a04994e063..95a260f9214b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
*
@@ -494,7 +495,7 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
- iter->module_name[0] = '\0';
+ strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
iter->exported = 0;
return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
&iter->value, &iter->type,
diff --git a/kernel/kcov.c b/kernel/kcov.c
index c2277dbdbfb1..2ee38727844a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -20,6 +20,7 @@
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
+#include <linux/refcount.h>
#include <asm/setup.h>
/* Number of 64-bit words written per one comparison: */
@@ -44,7 +45,7 @@ struct kcov {
* - opened file descriptor
* - task with enabled coverage (we can't unwire it from another task)
*/
- atomic_t refcount;
+ refcount_t refcount;
/* The lock protects mode, size, area and t. */
spinlock_t lock;
enum kcov_mode mode;
@@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
static void kcov_get(struct kcov *kcov)
{
- atomic_inc(&kcov->refcount);
+ refcount_inc(&kcov->refcount);
}
static void kcov_put(struct kcov *kcov)
{
- if (atomic_dec_and_test(&kcov->refcount)) {
+ if (refcount_dec_and_test(&kcov->refcount)) {
vfree(kcov->area);
kfree(kcov);
}
@@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
if (!kcov)
return -ENOMEM;
kcov->mode = KCOV_MODE_DISABLED;
- atomic_set(&kcov->refcount, 1);
+ refcount_set(&kcov->refcount, 1);
spin_lock_init(&kcov->lock);
filep->private_data = kcov;
return nonseekable_open(inode, filep);
@@ -444,10 +445,8 @@ static int __init kcov_init(void)
* there is no need to protect it against removal races. The
* use of debugfs_create_file_unsafe() is actually safe here.
*/
- if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
- pr_err("failed to create kcov in debugfs\n");
- return -ENOMEM;
- }
+ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops);
+
return 0;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 68559808fdfa..1b018f1a6e0d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec.c - kexec_load system call
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d7140447be75..d5870723b8ad 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec.c - kexec system call core code.
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -1150,7 +1148,7 @@ int kernel_kexec(void)
error = dpm_suspend_end(PMSG_FREEZE);
if (error)
goto Resume_devices;
- error = disable_nonboot_cpus();
+ error = suspend_disable_secondary_cpus();
if (error)
goto Enable_cpus;
local_irq_disable();
@@ -1183,7 +1181,7 @@ int kernel_kexec(void)
Enable_irqs:
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
dpm_resume_start(PMSG_RESTORE);
Resume_devices:
dpm_resume_end(PMSG_RESTORE);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f1d0e00a3971..b8cc032d5620 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec: kexec_file_load system call
*
* Copyright (C) 2014 Red Hat Inc.
* Authors:
* Vivek Goyal <vgoyal@redhat.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -198,9 +196,6 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
return ret;
image->kernel_buf_len = size;
- /* IMA needs to pass the measurement list to the next kernel. */
- ima_add_kexec_buffer(image);
-
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
image->kernel_buf_len);
@@ -241,8 +236,14 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
ret = -EINVAL;
goto out;
}
+
+ ima_kexec_cmdline(image->cmdline_buf,
+ image->cmdline_buf_len - 1);
}
+ /* IMA needs to pass the measurement list to the next kernel. */
+ ima_add_kexec_buffer(image);
+
/* Call arch image load handlers */
ldata = arch_kexec_kernel_image_load(image);
@@ -500,13 +501,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
return locate_mem_hole_bottom_up(start, end, kbuf);
}
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-static int kexec_walk_memblock(struct kexec_buf *kbuf,
- int (*func)(struct resource *, void *))
-{
- return 0;
-}
-#else
+#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
static int kexec_walk_memblock(struct kexec_buf *kbuf,
int (*func)(struct resource *, void *))
{
@@ -550,6 +545,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
return ret;
}
+#else
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 0;
+}
#endif
/**
@@ -589,7 +590,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
return 0;
- if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+ if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback);
@@ -688,7 +689,6 @@ static int kexec_calculate_store_digests(struct kimage *image)
goto out_free_desc;
desc->tfm = tfm;
- desc->flags = 0;
ret = crypto_shash_init(desc);
if (ret < 0)
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
new file mode 100644
index 000000000000..8f69772af77b
--- /dev/null
+++ b/kernel/kheaders.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide kernel headers useful to build tracing programs
+ * such as for running eBPF tracing tools.
+ *
+ * (Borrowed code from kernel/configs.c)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/init.h>
+
+/*
+ * Define kernel_headers_data and kernel_headers_data_end, within which the
+ * compressed kernel headers are stored. The file is first compressed with xz.
+ */
+
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .global kernel_headers_data \n"
+"kernel_headers_data: \n"
+" .incbin \"kernel/kheaders_data.tar.xz\" \n"
+" .global kernel_headers_data_end \n"
+"kernel_headers_data_end: \n"
+" .popsection \n"
+);
+
+extern char kernel_headers_data;
+extern char kernel_headers_data_end;
+
+static ssize_t
+ikheaders_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t len)
+{
+ memcpy(buf, &kernel_headers_data + off, len);
+ return len;
+}
+
+static struct bin_attribute kheaders_attr __ro_after_init = {
+ .attr = {
+ .name = "kheaders.tar.xz",
+ .mode = 0444,
+ },
+ .read = &ikheaders_read,
+};
+
+static int __init ikheaders_init(void)
+{
+ kheaders_attr.size = (&kernel_headers_data_end -
+ &kernel_headers_data);
+ return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
+}
+
+static void __exit ikheaders_cleanup(void)
+{
+ sysfs_remove_bin_file(kernel_kobj, &kheaders_attr);
+}
+
+module_init(ikheaders_init);
+module_exit(ikheaders_cleanup);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joel Fernandes");
+MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel");
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f4ddfdd2d07e..9f5433a52488 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Kernel Probes (KProbes)
* kernel/kprobes.c
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) IBM Corporation, 2002, 2004
*
* 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
@@ -709,7 +696,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
static int reuse_unused_kprobe(struct kprobe *ap)
{
struct optimized_kprobe *op;
- int ret;
/*
* Unused kprobe MUST be on the way of delayed unoptimizing (means
@@ -720,9 +706,8 @@ static int reuse_unused_kprobe(struct kprobe *ap)
/* Enable the probe again */
ap->flags &= ~KPROBE_FLAG_DISABLED;
/* Optimize it again (remove from op->list) */
- ret = kprobe_optready(ap);
- if (ret)
- return ret;
+ if (!kprobe_optready(ap))
+ return -EINVAL;
optimize_kprobe(ap);
return 0;
@@ -1396,7 +1381,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
addr < (unsigned long)__kprobes_text_end;
}
-bool within_kprobe_blacklist(unsigned long addr)
+static bool __within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blacklist_entry *ent;
@@ -1410,7 +1395,26 @@ bool within_kprobe_blacklist(unsigned long addr)
if (addr >= ent->start_addr && addr < ent->end_addr)
return true;
}
+ return false;
+}
+bool within_kprobe_blacklist(unsigned long addr)
+{
+ char symname[KSYM_NAME_LEN], *p;
+
+ if (__within_kprobe_blacklist(addr))
+ return true;
+
+ /* Check if the address is on a suffixed-symbol */
+ if (!lookup_symbol_name(addr, symname)) {
+ p = strchr(symname, '.');
+ if (!p)
+ return false;
+ *p = '\0';
+ addr = (unsigned long)kprobe_lookup_name(symname, 0);
+ if (addr)
+ return __within_kprobe_blacklist(addr);
+ }
return false;
}
@@ -2566,33 +2570,20 @@ static const struct file_operations fops_kp = {
static int __init debugfs_kprobe_init(void)
{
- struct dentry *dir, *file;
+ struct dentry *dir;
unsigned int value = 1;
dir = debugfs_create_dir("kprobes", NULL);
- if (!dir)
- return -ENOMEM;
- file = debugfs_create_file("list", 0400, dir, NULL,
- &debugfs_kprobes_operations);
- if (!file)
- goto error;
+ debugfs_create_file("list", 0400, dir, NULL,
+ &debugfs_kprobes_operations);
- file = debugfs_create_file("enabled", 0600, dir,
- &value, &fops_kp);
- if (!file)
- goto error;
+ debugfs_create_file("enabled", 0600, dir, &value, &fops_kp);
- file = debugfs_create_file("blacklist", 0400, dir, NULL,
- &debugfs_kprobe_blacklist_ops);
- if (!file)
- goto error;
+ debugfs_create_file("blacklist", 0400, dir, NULL,
+ &debugfs_kprobe_blacklist_ops);
return 0;
-
-error:
- debugfs_remove(dir);
- return -ENOMEM;
}
late_initcall(debugfs_kprobe_init);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 46ba853656f6..35859da8bd4f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
* are not related to any other subsystem
*
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
- *
- * This file is release under the GPLv2
- *
*/
#include <linux/kobject.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 087d18d771b5..621467c33fef 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
*
@@ -11,6 +12,7 @@
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
+#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
@@ -20,6 +22,7 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/numa.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -101,6 +104,12 @@ bool kthread_should_stop(void)
}
EXPORT_SYMBOL(kthread_should_stop);
+bool __kthread_should_park(struct task_struct *k)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
+}
+EXPORT_SYMBOL_GPL(__kthread_should_park);
+
/**
* kthread_should_park - should this kthread park now?
*
@@ -114,7 +123,7 @@ EXPORT_SYMBOL(kthread_should_stop);
*/
bool kthread_should_park(void)
{
- return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+ return __kthread_should_park(current);
}
EXPORT_SYMBOL_GPL(kthread_should_park);
@@ -599,7 +608,7 @@ void __kthread_init_worker(struct kthread_worker *worker,
struct lock_class_key *key)
{
memset(worker, 0, sizeof(struct kthread_worker));
- spin_lock_init(&worker->lock);
+ raw_spin_lock_init(&worker->lock);
lockdep_set_class_and_name(&worker->lock, key, name);
INIT_LIST_HEAD(&worker->work_list);
INIT_LIST_HEAD(&worker->delayed_work_list);
@@ -641,21 +650,21 @@ repeat:
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
worker->task = NULL;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
return 0;
}
work = NULL;
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
if (!list_empty(&worker->work_list)) {
work = list_first_entry(&worker->work_list,
struct kthread_work, node);
list_del_init(&work->node);
}
worker->current_work = work;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (work) {
__set_current_state(TASK_RUNNING);
@@ -675,7 +684,7 @@ __kthread_create_worker(int cpu, unsigned int flags,
{
struct kthread_worker *worker;
struct task_struct *task;
- int node = -1;
+ int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
@@ -812,12 +821,12 @@ bool kthread_queue_work(struct kthread_worker *worker,
bool ret = false;
unsigned long flags;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
kthread_insert_work(worker, work, &worker->work_list);
ret = true;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);
@@ -835,6 +844,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
struct kthread_work *work = &dwork->work;
struct kthread_worker *worker = work->worker;
+ unsigned long flags;
/*
* This might happen when a pending work is reinitialized.
@@ -843,7 +853,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
if (WARN_ON_ONCE(!worker))
return;
- spin_lock(&worker->lock);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -852,7 +862,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t)
list_del_init(&work->node);
kthread_insert_work(worker, work, &worker->work_list);
- spin_unlock(&worker->lock);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);
@@ -908,14 +918,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker,
unsigned long flags;
bool ret = false;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
if (!queuing_blocked(worker, work)) {
__kthread_queue_delayed_work(worker, dwork, delay);
ret = true;
}
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);
@@ -951,7 +961,7 @@ void kthread_flush_work(struct kthread_work *work)
if (!worker)
return;
- spin_lock_irq(&worker->lock);
+ raw_spin_lock_irq(&worker->lock);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -963,7 +973,7 @@ void kthread_flush_work(struct kthread_work *work)
else
noop = true;
- spin_unlock_irq(&worker->lock);
+ raw_spin_unlock_irq(&worker->lock);
if (!noop)
wait_for_completion(&fwork.done);
@@ -996,9 +1006,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
* any queuing is blocked by setting the canceling counter.
*/
work->canceling++;
- spin_unlock_irqrestore(&worker->lock, *flags);
+ raw_spin_unlock_irqrestore(&worker->lock, *flags);
del_timer_sync(&dwork->timer);
- spin_lock_irqsave(&worker->lock, *flags);
+ raw_spin_lock_irqsave(&worker->lock, *flags);
work->canceling--;
}
@@ -1045,7 +1055,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
unsigned long flags;
int ret = false;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Do not bother with canceling when never queued. */
if (!work->worker)
@@ -1062,7 +1072,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker,
fast_queue:
__kthread_queue_delayed_work(worker, dwork, delay);
out:
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);
@@ -1076,7 +1086,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
if (!worker)
goto out;
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
/* Work must not be used with >1 worker, see kthread_queue_work(). */
WARN_ON_ONCE(work->worker != worker);
@@ -1090,13 +1100,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
* In the meantime, block any queuing by setting the canceling counter.
*/
work->canceling++;
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
kthread_flush_work(work);
- spin_lock_irqsave(&worker->lock, flags);
+ raw_spin_lock_irqsave(&worker->lock, flags);
work->canceling--;
out_fast:
- spin_unlock_irqrestore(&worker->lock, flags);
+ raw_spin_unlock_irqrestore(&worker->lock, flags);
out:
return ret;
}
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 96b4179cee6a..e3acead004e6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* latencytop.c: Latency display infrastructure
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
/*
@@ -67,13 +63,10 @@ static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
-void clear_all_latency_tracing(struct task_struct *p)
+void clear_tsk_latency_tracing(struct task_struct *p)
{
unsigned long flags;
- if (!latencytop_enabled)
- return;
-
raw_spin_lock_irqsave(&latency_lock, flags);
memset(&p->latency_record, 0, sizeof(p->latency_record));
p->latency_record_count = 0;
@@ -96,9 +89,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
int firstnonnull = MAXLR + 1;
int i;
- if (!latencytop_enabled)
- return;
-
/* skip kernel threads for now */
if (!tsk->mm)
return;
@@ -120,8 +110,8 @@ account_global_scheduler_latency(struct task_struct *tsk,
break;
}
- /* 0 and ULONG_MAX entries mean end of backtrace: */
- if (record == 0 || record == ULONG_MAX)
+ /* 0 entry marks end of backtrace: */
+ if (!record)
break;
}
if (same) {
@@ -141,20 +131,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}
-/*
- * Iterator to store a backtrace into a latency record entry
- */
-static inline void store_stacktrace(struct task_struct *tsk,
- struct latency_record *lat)
-{
- struct stack_trace trace;
-
- memset(&trace, 0, sizeof(trace));
- trace.max_entries = LT_BACKTRACEDEPTH;
- trace.entries = &lat->backtrace[0];
- save_stack_trace_tsk(tsk, &trace);
-}
-
/**
* __account_scheduler_latency - record an occurred latency
* @tsk - the task struct of the task hitting the latency
@@ -191,7 +167,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
lat.count = 1;
lat.time = usecs;
lat.max = usecs;
- store_stacktrace(tsk, &lat);
+
+ stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0);
raw_spin_lock_irqsave(&latency_lock, flags);
@@ -210,8 +187,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
break;
}
- /* 0 and ULONG_MAX entries mean end of backtrace: */
- if (record == 0 || record == ULONG_MAX)
+ /* 0 entry is end of backtrace */
+ if (!record)
break;
}
if (same) {
@@ -252,10 +229,10 @@ static int lstats_show(struct seq_file *m, void *v)
lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long bt = lr->backtrace[q];
+
if (!bt)
break;
- if (bt == ULONG_MAX)
- break;
+
seq_printf(m, " %ps", (void *)bt);
}
seq_puts(m, "\n");
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index ec4565122e65..54102deb50ba 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config HAVE_LIVEPATCH
bool
help
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index b36ceda6488e..cf9b5bcdb952 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_LIVEPATCH) += livepatch.o
livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 5b77a7314e01..c4ce08f43bd6 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* core.c - Kernel Live Patching Core
*
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2014 SUSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -30,6 +18,7 @@
#include <linux/elf.h>
#include <linux/moduleloader.h>
#include <linux/completion.h>
+#include <linux/memory.h>
#include <asm/cacheflush.h>
#include "core.h"
#include "patch.h"
@@ -45,7 +34,12 @@
*/
DEFINE_MUTEX(klp_mutex);
-static LIST_HEAD(klp_patches);
+/*
+ * Actively used patches: enabled or in transition. Note that replaced
+ * or disabled patches are not listed even though the related kernel
+ * module still can be loaded.
+ */
+LIST_HEAD(klp_patches);
static struct kobject *klp_root_kobj;
@@ -82,20 +76,43 @@ static void klp_find_object_module(struct klp_object *obj)
mutex_unlock(&module_mutex);
}
-static bool klp_is_patch_registered(struct klp_patch *patch)
+static bool klp_initialized(void)
{
- struct klp_patch *mypatch;
+ return !!klp_root_kobj;
+}
- list_for_each_entry(mypatch, &klp_patches, list)
- if (mypatch == patch)
- return true;
+static struct klp_func *klp_find_func(struct klp_object *obj,
+ struct klp_func *old_func)
+{
+ struct klp_func *func;
+
+ klp_for_each_func(obj, func) {
+ if ((strcmp(old_func->old_name, func->old_name) == 0) &&
+ (old_func->old_sympos == func->old_sympos)) {
+ return func;
+ }
+ }
- return false;
+ return NULL;
}
-static bool klp_initialized(void)
+static struct klp_object *klp_find_object(struct klp_patch *patch,
+ struct klp_object *old_obj)
{
- return !!klp_root_kobj;
+ struct klp_object *obj;
+
+ klp_for_each_object(patch, obj) {
+ if (klp_is_module(old_obj)) {
+ if (klp_is_module(obj) &&
+ strcmp(old_obj->name, obj->name) == 0) {
+ return obj;
+ }
+ } else if (!klp_is_module(obj)) {
+ return obj;
+ }
+ }
+
+ return NULL;
}
struct klp_find_arg {
@@ -278,170 +295,6 @@ static int klp_write_object_relocations(struct module *pmod,
return ret;
}
-static int __klp_disable_patch(struct klp_patch *patch)
-{
- struct klp_object *obj;
-
- if (WARN_ON(!patch->enabled))
- return -EINVAL;
-
- if (klp_transition_patch)
- return -EBUSY;
-
- /* enforce stacking: only the last enabled patch can be disabled */
- if (!list_is_last(&patch->list, &klp_patches) &&
- list_next_entry(patch, list)->enabled)
- return -EBUSY;
-
- klp_init_transition(patch, KLP_UNPATCHED);
-
- klp_for_each_object(patch, obj)
- if (obj->patched)
- klp_pre_unpatch_callback(obj);
-
- /*
- * Enforce the order of the func->transition writes in
- * klp_init_transition() and the TIF_PATCH_PENDING writes in
- * klp_start_transition(). In the rare case where klp_ftrace_handler()
- * is called shortly after klp_update_patch_state() switches the task,
- * this ensures the handler sees that func->transition is set.
- */
- smp_wmb();
-
- klp_start_transition();
- klp_try_complete_transition();
- patch->enabled = false;
-
- return 0;
-}
-
-/**
- * klp_disable_patch() - disables a registered patch
- * @patch: The registered, enabled patch to be disabled
- *
- * Unregisters the patched functions from ftrace.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_disable_patch(struct klp_patch *patch)
-{
- int ret;
-
- mutex_lock(&klp_mutex);
-
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
-
- if (!patch->enabled) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = __klp_disable_patch(patch);
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(klp_disable_patch);
-
-static int __klp_enable_patch(struct klp_patch *patch)
-{
- struct klp_object *obj;
- int ret;
-
- if (klp_transition_patch)
- return -EBUSY;
-
- if (WARN_ON(patch->enabled))
- return -EINVAL;
-
- /* enforce stacking: only the first disabled patch can be enabled */
- if (patch->list.prev != &klp_patches &&
- !list_prev_entry(patch, list)->enabled)
- return -EBUSY;
-
- /*
- * A reference is taken on the patch module to prevent it from being
- * unloaded.
- */
- if (!try_module_get(patch->mod))
- return -ENODEV;
-
- pr_notice("enabling patch '%s'\n", patch->mod->name);
-
- klp_init_transition(patch, KLP_PATCHED);
-
- /*
- * Enforce the order of the func->transition writes in
- * klp_init_transition() and the ops->func_stack writes in
- * klp_patch_object(), so that klp_ftrace_handler() will see the
- * func->transition updates before the handler is registered and the
- * new funcs become visible to the handler.
- */
- smp_wmb();
-
- klp_for_each_object(patch, obj) {
- if (!klp_is_object_loaded(obj))
- continue;
-
- ret = klp_pre_patch_callback(obj);
- if (ret) {
- pr_warn("pre-patch callback failed for object '%s'\n",
- klp_is_module(obj) ? obj->name : "vmlinux");
- goto err;
- }
-
- ret = klp_patch_object(obj);
- if (ret) {
- pr_warn("failed to patch object '%s'\n",
- klp_is_module(obj) ? obj->name : "vmlinux");
- goto err;
- }
- }
-
- klp_start_transition();
- klp_try_complete_transition();
- patch->enabled = true;
-
- return 0;
-err:
- pr_warn("failed to enable patch '%s'\n", patch->mod->name);
-
- klp_cancel_transition();
- return ret;
-}
-
-/**
- * klp_enable_patch() - enables a registered patch
- * @patch: The registered, disabled patch to be enabled
- *
- * Performs the needed symbol lookups and code relocations,
- * then registers the patched functions with ftrace.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_enable_patch(struct klp_patch *patch)
-{
- int ret;
-
- mutex_lock(&klp_mutex);
-
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
-
- ret = __klp_enable_patch(patch);
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(klp_enable_patch);
-
/*
* Sysfs Interface
*
@@ -449,11 +302,11 @@ EXPORT_SYMBOL_GPL(klp_enable_patch);
* /sys/kernel/livepatch/<patch>
* /sys/kernel/livepatch/<patch>/enabled
* /sys/kernel/livepatch/<patch>/transition
- * /sys/kernel/livepatch/<patch>/signal
* /sys/kernel/livepatch/<patch>/force
* /sys/kernel/livepatch/<patch>/<object>
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
*/
+static int __klp_disable_patch(struct klp_patch *patch);
static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
@@ -470,40 +323,32 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr,
mutex_lock(&klp_mutex);
- if (!klp_is_patch_registered(patch)) {
- /*
- * Module with the patch could either disappear meanwhile or is
- * not properly initialized yet.
- */
- ret = -EINVAL;
- goto err;
- }
-
if (patch->enabled == enabled) {
/* already in requested state */
ret = -EINVAL;
- goto err;
+ goto out;
}
- if (patch == klp_transition_patch) {
+ /*
+ * Allow to reverse a pending transition in both ways. It might be
+ * necessary to complete the transition without forcing and breaking
+ * the system integrity.
+ *
+ * Do not allow to re-enable a disabled patch.
+ */
+ if (patch == klp_transition_patch)
klp_reverse_transition();
- } else if (enabled) {
- ret = __klp_enable_patch(patch);
- if (ret)
- goto err;
- } else {
+ else if (!enabled)
ret = __klp_disable_patch(patch);
- if (ret)
- goto err;
- }
+ else
+ ret = -EINVAL;
+out:
mutex_unlock(&klp_mutex);
+ if (ret)
+ return ret;
return count;
-
-err:
- mutex_unlock(&klp_mutex);
- return ret;
}
static ssize_t enabled_show(struct kobject *kobj,
@@ -525,35 +370,6 @@ static ssize_t transition_show(struct kobject *kobj,
patch == klp_transition_patch);
}
-static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- struct klp_patch *patch;
- int ret;
- bool val;
-
- ret = kstrtobool(buf, &val);
- if (ret)
- return ret;
-
- if (!val)
- return count;
-
- mutex_lock(&klp_mutex);
-
- patch = container_of(kobj, struct klp_patch, kobj);
- if (patch != klp_transition_patch) {
- mutex_unlock(&klp_mutex);
- return -EINVAL;
- }
-
- klp_send_signals();
-
- mutex_unlock(&klp_mutex);
-
- return count;
-}
-
static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
@@ -585,15 +401,132 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr,
static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
-static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal);
static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
static struct attribute *klp_patch_attrs[] = {
&enabled_kobj_attr.attr,
&transition_kobj_attr.attr,
- &signal_kobj_attr.attr,
&force_kobj_attr.attr,
NULL
};
+ATTRIBUTE_GROUPS(klp_patch);
+
+static void klp_free_object_dynamic(struct klp_object *obj)
+{
+ kfree(obj->name);
+ kfree(obj);
+}
+
+static void klp_init_func_early(struct klp_object *obj,
+ struct klp_func *func);
+static void klp_init_object_early(struct klp_patch *patch,
+ struct klp_object *obj);
+
+static struct klp_object *klp_alloc_object_dynamic(const char *name,
+ struct klp_patch *patch)
+{
+ struct klp_object *obj;
+
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return NULL;
+
+ if (name) {
+ obj->name = kstrdup(name, GFP_KERNEL);
+ if (!obj->name) {
+ kfree(obj);
+ return NULL;
+ }
+ }
+
+ klp_init_object_early(patch, obj);
+ obj->dynamic = true;
+
+ return obj;
+}
+
+static void klp_free_func_nop(struct klp_func *func)
+{
+ kfree(func->old_name);
+ kfree(func);
+}
+
+static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func,
+ struct klp_object *obj)
+{
+ struct klp_func *func;
+
+ func = kzalloc(sizeof(*func), GFP_KERNEL);
+ if (!func)
+ return NULL;
+
+ if (old_func->old_name) {
+ func->old_name = kstrdup(old_func->old_name, GFP_KERNEL);
+ if (!func->old_name) {
+ kfree(func);
+ return NULL;
+ }
+ }
+
+ klp_init_func_early(obj, func);
+ /*
+ * func->new_func is same as func->old_func. These addresses are
+ * set when the object is loaded, see klp_init_object_loaded().
+ */
+ func->old_sympos = old_func->old_sympos;
+ func->nop = true;
+
+ return func;
+}
+
+static int klp_add_object_nops(struct klp_patch *patch,
+ struct klp_object *old_obj)
+{
+ struct klp_object *obj;
+ struct klp_func *func, *old_func;
+
+ obj = klp_find_object(patch, old_obj);
+
+ if (!obj) {
+ obj = klp_alloc_object_dynamic(old_obj->name, patch);
+ if (!obj)
+ return -ENOMEM;
+ }
+
+ klp_for_each_func(old_obj, old_func) {
+ func = klp_find_func(obj, old_func);
+ if (func)
+ continue;
+
+ func = klp_alloc_func_nop(old_func, obj);
+ if (!func)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Add 'nop' functions which simply return to the caller to run
+ * the original function. The 'nop' functions are added to a
+ * patch to facilitate a 'replace' mode.
+ */
+static int klp_add_nops(struct klp_patch *patch)
+{
+ struct klp_patch *old_patch;
+ struct klp_object *old_obj;
+
+ klp_for_each_patch(old_patch) {
+ klp_for_each_object(old_patch, old_obj) {
+ int err;
+
+ err = klp_add_object_nops(patch, old_obj);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
static void klp_kobj_release_patch(struct kobject *kobj)
{
@@ -606,11 +539,17 @@ static void klp_kobj_release_patch(struct kobject *kobj)
static struct kobj_type klp_ktype_patch = {
.release = klp_kobj_release_patch,
.sysfs_ops = &kobj_sysfs_ops,
- .default_attrs = klp_patch_attrs,
+ .default_groups = klp_patch_groups,
};
static void klp_kobj_release_object(struct kobject *kobj)
{
+ struct klp_object *obj;
+
+ obj = container_of(kobj, struct klp_object, kobj);
+
+ if (obj->dynamic)
+ klp_free_object_dynamic(obj);
}
static struct kobj_type klp_ktype_object = {
@@ -620,6 +559,12 @@ static struct kobj_type klp_ktype_object = {
static void klp_kobj_release_func(struct kobject *kobj)
{
+ struct klp_func *func;
+
+ func = container_of(kobj, struct klp_func, kobj);
+
+ if (func->nop)
+ klp_free_func_nop(func);
}
static struct kobj_type klp_ktype_func = {
@@ -627,17 +572,17 @@ static struct kobj_type klp_ktype_func = {
.sysfs_ops = &kobj_sysfs_ops,
};
-/*
- * Free all functions' kobjects in the array up to some limit. When limit is
- * NULL, all kobjects are freed.
- */
-static void klp_free_funcs_limited(struct klp_object *obj,
- struct klp_func *limit)
+static void __klp_free_funcs(struct klp_object *obj, bool nops_only)
{
- struct klp_func *func;
+ struct klp_func *func, *tmp_func;
+
+ klp_for_each_func_safe(obj, func, tmp_func) {
+ if (nops_only && !func->nop)
+ continue;
- for (func = obj->funcs; func->old_name && func != limit; func++)
+ list_del(&func->node);
kobject_put(&func->kobj);
+ }
}
/* Clean up when a patched object is unloaded */
@@ -647,35 +592,101 @@ static void klp_free_object_loaded(struct klp_object *obj)
obj->mod = NULL;
- klp_for_each_func(obj, func)
- func->old_addr = 0;
+ klp_for_each_func(obj, func) {
+ func->old_func = NULL;
+
+ if (func->nop)
+ func->new_func = NULL;
+ }
}
-/*
- * Free all objects' kobjects in the array up to some limit. When limit is
- * NULL, all kobjects are freed.
- */
-static void klp_free_objects_limited(struct klp_patch *patch,
- struct klp_object *limit)
+static void __klp_free_objects(struct klp_patch *patch, bool nops_only)
{
- struct klp_object *obj;
+ struct klp_object *obj, *tmp_obj;
- for (obj = patch->objs; obj->funcs && obj != limit; obj++) {
- klp_free_funcs_limited(obj, NULL);
+ klp_for_each_object_safe(patch, obj, tmp_obj) {
+ __klp_free_funcs(obj, nops_only);
+
+ if (nops_only && !obj->dynamic)
+ continue;
+
+ list_del(&obj->node);
kobject_put(&obj->kobj);
}
}
-static void klp_free_patch(struct klp_patch *patch)
+static void klp_free_objects(struct klp_patch *patch)
+{
+ __klp_free_objects(patch, false);
+}
+
+static void klp_free_objects_dynamic(struct klp_patch *patch)
+{
+ __klp_free_objects(patch, true);
+}
+
+/*
+ * This function implements the free operations that can be called safely
+ * under klp_mutex.
+ *
+ * The operation must be completed by calling klp_free_patch_finish()
+ * outside klp_mutex.
+ */
+void klp_free_patch_start(struct klp_patch *patch)
{
- klp_free_objects_limited(patch, NULL);
if (!list_empty(&patch->list))
list_del(&patch->list);
+
+ klp_free_objects(patch);
+}
+
+/*
+ * This function implements the free part that must be called outside
+ * klp_mutex.
+ *
+ * It must be called after klp_free_patch_start(). And it has to be
+ * the last function accessing the livepatch structures when the patch
+ * gets disabled.
+ */
+static void klp_free_patch_finish(struct klp_patch *patch)
+{
+ /*
+ * Avoid deadlock with enabled_store() sysfs callback by
+ * calling this outside klp_mutex. It is safe because
+ * this is called when the patch gets disabled and it
+ * cannot get enabled again.
+ */
+ kobject_put(&patch->kobj);
+ wait_for_completion(&patch->finish);
+
+ /* Put the module after the last access to struct klp_patch. */
+ if (!patch->forced)
+ module_put(patch->mod);
+}
+
+/*
+ * The livepatch might be freed from sysfs interface created by the patch.
+ * This work allows to wait until the interface is destroyed in a separate
+ * context.
+ */
+static void klp_free_patch_work_fn(struct work_struct *work)
+{
+ struct klp_patch *patch =
+ container_of(work, struct klp_patch, free_work);
+
+ klp_free_patch_finish(patch);
}
static int klp_init_func(struct klp_object *obj, struct klp_func *func)
{
- if (!func->old_name || !func->new_func)
+ if (!func->old_name)
+ return -EINVAL;
+
+ /*
+ * NOPs get the address later. The patched module must be loaded,
+ * see klp_init_object_loaded().
+ */
+ if (!func->new_func && !func->nop)
return -EINVAL;
if (strlen(func->old_name) >= KSYM_NAME_LEN)
@@ -690,9 +701,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
* object. If the user selects 0 for old_sympos, then 1 will be used
* since a unique symbol will be the first occurrence.
*/
- return kobject_init_and_add(&func->kobj, &klp_ktype_func,
- &obj->kobj, "%s,%lu", func->old_name,
- func->old_sympos ? func->old_sympos : 1);
+ return kobject_add(&func->kobj, &obj->kobj, "%s,%lu",
+ func->old_name,
+ func->old_sympos ? func->old_sympos : 1);
}
/* Arches may override this to finish any remaining arch-specific tasks */
@@ -708,24 +719,29 @@ static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_func *func;
int ret;
+ mutex_lock(&text_mutex);
+
module_disable_ro(patch->mod);
ret = klp_write_object_relocations(patch->mod, obj);
if (ret) {
module_enable_ro(patch->mod, true);
+ mutex_unlock(&text_mutex);
return ret;
}
arch_klp_init_object_loaded(patch, obj);
module_enable_ro(patch->mod, true);
+ mutex_unlock(&text_mutex);
+
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
func->old_sympos,
- &func->old_addr);
+ (unsigned long *)&func->old_func);
if (ret)
return ret;
- ret = kallsyms_lookup_size_offset(func->old_addr,
+ ret = kallsyms_lookup_size_offset((unsigned long)func->old_func,
&func->old_size, NULL);
if (!ret) {
pr_err("kallsyms size lookup failed for '%s'\n",
@@ -733,6 +749,9 @@ static int klp_init_object_loaded(struct klp_patch *patch,
return -ENOENT;
}
+ if (func->nop)
+ func->new_func = func->old_func;
+
ret = kallsyms_lookup_size_offset((unsigned long)func->new_func,
&func->new_size, NULL);
if (!ret) {
@@ -751,9 +770,6 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
int ret;
const char *name;
- if (!obj->funcs)
- return -EINVAL;
-
if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN)
return -EINVAL;
@@ -763,126 +779,200 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
klp_find_object_module(obj);
name = klp_is_module(obj) ? obj->name : "vmlinux";
- ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object,
- &patch->kobj, "%s", name);
+ ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name);
if (ret)
return ret;
klp_for_each_func(obj, func) {
ret = klp_init_func(obj, func);
if (ret)
- goto free;
+ return ret;
}
- if (klp_is_object_loaded(obj)) {
+ if (klp_is_object_loaded(obj))
ret = klp_init_object_loaded(patch, obj);
- if (ret)
- goto free;
- }
-
- return 0;
-free:
- klp_free_funcs_limited(obj, func);
- kobject_put(&obj->kobj);
return ret;
}
-static int klp_init_patch(struct klp_patch *patch)
+static void klp_init_func_early(struct klp_object *obj,
+ struct klp_func *func)
+{
+ kobject_init(&func->kobj, &klp_ktype_func);
+ list_add_tail(&func->node, &obj->func_list);
+}
+
+static void klp_init_object_early(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ INIT_LIST_HEAD(&obj->func_list);
+ kobject_init(&obj->kobj, &klp_ktype_object);
+ list_add_tail(&obj->node, &patch->obj_list);
+}
+
+static int klp_init_patch_early(struct klp_patch *patch)
{
struct klp_object *obj;
- int ret;
+ struct klp_func *func;
if (!patch->objs)
return -EINVAL;
- mutex_lock(&klp_mutex);
-
+ INIT_LIST_HEAD(&patch->list);
+ INIT_LIST_HEAD(&patch->obj_list);
+ kobject_init(&patch->kobj, &klp_ktype_patch);
patch->enabled = false;
+ patch->forced = false;
+ INIT_WORK(&patch->free_work, klp_free_patch_work_fn);
init_completion(&patch->finish);
- ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
- klp_root_kobj, "%s", patch->mod->name);
- if (ret) {
- mutex_unlock(&klp_mutex);
+ klp_for_each_object_static(patch, obj) {
+ if (!obj->funcs)
+ return -EINVAL;
+
+ klp_init_object_early(patch, obj);
+
+ klp_for_each_func_static(obj, func) {
+ klp_init_func_early(obj, func);
+ }
+ }
+
+ if (!try_module_get(patch->mod))
+ return -ENODEV;
+
+ return 0;
+}
+
+static int klp_init_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
+ int ret;
+
+ ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name);
+ if (ret)
return ret;
+
+ if (patch->replace) {
+ ret = klp_add_nops(patch);
+ if (ret)
+ return ret;
}
klp_for_each_object(patch, obj) {
ret = klp_init_object(patch, obj);
if (ret)
- goto free;
+ return ret;
}
list_add_tail(&patch->list, &klp_patches);
- mutex_unlock(&klp_mutex);
-
return 0;
+}
+
+static int __klp_disable_patch(struct klp_patch *patch)
+{
+ struct klp_object *obj;
-free:
- klp_free_objects_limited(patch, obj);
+ if (WARN_ON(!patch->enabled))
+ return -EINVAL;
- mutex_unlock(&klp_mutex);
+ if (klp_transition_patch)
+ return -EBUSY;
- kobject_put(&patch->kobj);
- wait_for_completion(&patch->finish);
+ klp_init_transition(patch, KLP_UNPATCHED);
- return ret;
+ klp_for_each_object(patch, obj)
+ if (obj->patched)
+ klp_pre_unpatch_callback(obj);
+
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the TIF_PATCH_PENDING writes in
+ * klp_start_transition(). In the rare case where klp_ftrace_handler()
+ * is called shortly after klp_update_patch_state() switches the task,
+ * this ensures the handler sees that func->transition is set.
+ */
+ smp_wmb();
+
+ klp_start_transition();
+ patch->enabled = false;
+ klp_try_complete_transition();
+
+ return 0;
}
-/**
- * klp_unregister_patch() - unregisters a patch
- * @patch: Disabled patch to be unregistered
- *
- * Frees the data structures and removes the sysfs interface.
- *
- * Return: 0 on success, otherwise error
- */
-int klp_unregister_patch(struct klp_patch *patch)
+static int __klp_enable_patch(struct klp_patch *patch)
{
+ struct klp_object *obj;
int ret;
- mutex_lock(&klp_mutex);
+ if (klp_transition_patch)
+ return -EBUSY;
- if (!klp_is_patch_registered(patch)) {
- ret = -EINVAL;
- goto err;
- }
+ if (WARN_ON(patch->enabled))
+ return -EINVAL;
- if (patch->enabled) {
- ret = -EBUSY;
- goto err;
- }
+ pr_notice("enabling patch '%s'\n", patch->mod->name);
- klp_free_patch(patch);
+ klp_init_transition(patch, KLP_PATCHED);
- mutex_unlock(&klp_mutex);
+ /*
+ * Enforce the order of the func->transition writes in
+ * klp_init_transition() and the ops->func_stack writes in
+ * klp_patch_object(), so that klp_ftrace_handler() will see the
+ * func->transition updates before the handler is registered and the
+ * new funcs become visible to the handler.
+ */
+ smp_wmb();
- kobject_put(&patch->kobj);
- wait_for_completion(&patch->finish);
+ klp_for_each_object(patch, obj) {
+ if (!klp_is_object_loaded(obj))
+ continue;
+
+ ret = klp_pre_patch_callback(obj);
+ if (ret) {
+ pr_warn("pre-patch callback failed for object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
+
+ ret = klp_patch_object(obj);
+ if (ret) {
+ pr_warn("failed to patch object '%s'\n",
+ klp_is_module(obj) ? obj->name : "vmlinux");
+ goto err;
+ }
+ }
+
+ klp_start_transition();
+ patch->enabled = true;
+ klp_try_complete_transition();
return 0;
err:
- mutex_unlock(&klp_mutex);
+ pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+ klp_cancel_transition();
return ret;
}
-EXPORT_SYMBOL_GPL(klp_unregister_patch);
/**
- * klp_register_patch() - registers a patch
- * @patch: Patch to be registered
+ * klp_enable_patch() - enable the livepatch
+ * @patch: patch to be enabled
*
- * Initializes the data structure associated with the patch and
- * creates the sysfs interface.
+ * Initializes the data structure associated with the patch, creates the sysfs
+ * interface, performs the needed symbol lookups and code relocations,
+ * registers the patched functions with ftrace.
*
- * There is no need to take the reference on the patch module here. It is done
- * later when the patch is enabled.
+ * This function is supposed to be called from the livepatch module_init()
+ * callback.
*
* Return: 0 on success, otherwise error
*/
-int klp_register_patch(struct klp_patch *patch)
+int klp_enable_patch(struct klp_patch *patch)
{
+ int ret;
+
if (!patch || !patch->mod)
return -EINVAL;
@@ -896,13 +986,91 @@ int klp_register_patch(struct klp_patch *patch)
return -ENODEV;
if (!klp_have_reliable_stack()) {
- pr_err("This architecture doesn't have support for the livepatch consistency model.\n");
- return -ENOSYS;
+ pr_warn("This architecture doesn't have support for the livepatch consistency model.\n");
+ pr_warn("The livepatch transition may never complete.\n");
+ }
+
+ mutex_lock(&klp_mutex);
+
+ ret = klp_init_patch_early(patch);
+ if (ret) {
+ mutex_unlock(&klp_mutex);
+ return ret;
+ }
+
+ ret = klp_init_patch(patch);
+ if (ret)
+ goto err;
+
+ ret = __klp_enable_patch(patch);
+ if (ret)
+ goto err;
+
+ mutex_unlock(&klp_mutex);
+
+ return 0;
+
+err:
+ klp_free_patch_start(patch);
+
+ mutex_unlock(&klp_mutex);
+
+ klp_free_patch_finish(patch);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(klp_enable_patch);
+
+/*
+ * This function removes replaced patches.
+ *
+ * We could be pretty aggressive here. It is called in the situation where
+ * these structures are no longer accessible. All functions are redirected
+ * by the klp_transition_patch. They use either a new code or they are in
+ * the original code because of the special nop function patches.
+ *
+ * The only exception is when the transition was forced. In this case,
+ * klp_ftrace_handler() might still see the replaced patch on the stack.
+ * Fortunately, it is carefully designed to work with removed functions
+ * thanks to RCU. We only have to keep the patches on the system. Also
+ * this is handled transparently by patch->module_put.
+ */
+void klp_discard_replaced_patches(struct klp_patch *new_patch)
+{
+ struct klp_patch *old_patch, *tmp_patch;
+
+ klp_for_each_patch_safe(old_patch, tmp_patch) {
+ if (old_patch == new_patch)
+ return;
+
+ old_patch->enabled = false;
+ klp_unpatch_objects(old_patch);
+ klp_free_patch_start(old_patch);
+ schedule_work(&old_patch->free_work);
}
+}
- return klp_init_patch(patch);
+/*
+ * This function removes the dynamically allocated 'nop' functions.
+ *
+ * We could be pretty aggressive. NOPs do not change the existing
+ * behavior except for adding unnecessary delay by the ftrace handler.
+ *
+ * It is safe even when the transition was forced. The ftrace handler
+ * will see a valid ops->func_stack entry thanks to RCU.
+ *
+ * We could even free the NOPs structures. They must be the last entry
+ * in ops->func_stack. Therefore unregister_ftrace_function() is called.
+ * It does the same as klp_synchronize_transition() to make sure that
+ * nobody is inside the ftrace handler once the operation finishes.
+ *
+ * IMPORTANT: It must be called right after removing the replaced patches!
+ */
+void klp_discard_nops(struct klp_patch *new_patch)
+{
+ klp_unpatch_objects_dynamic(klp_transition_patch);
+ klp_free_objects_dynamic(klp_transition_patch);
}
-EXPORT_SYMBOL_GPL(klp_register_patch);
/*
* Remove parts of patches that touch a given kernel module. The list of
@@ -915,7 +1083,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
struct klp_patch *patch;
struct klp_object *obj;
- list_for_each_entry(patch, &klp_patches, list) {
+ klp_for_each_patch(patch) {
if (patch == limit)
break;
@@ -923,21 +1091,14 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
- /*
- * Only unpatch the module if the patch is enabled or
- * is in transition.
- */
- if (patch->enabled || patch == klp_transition_patch) {
-
- if (patch != klp_transition_patch)
- klp_pre_unpatch_callback(obj);
+ if (patch != klp_transition_patch)
+ klp_pre_unpatch_callback(obj);
- pr_notice("reverting patch '%s' on unloading module '%s'\n",
- patch->mod->name, obj->mod->name);
- klp_unpatch_object(obj);
+ pr_notice("reverting patch '%s' on unloading module '%s'\n",
+ patch->mod->name, obj->mod->name);
+ klp_unpatch_object(obj);
- klp_post_unpatch_callback(obj);
- }
+ klp_post_unpatch_callback(obj);
klp_free_object_loaded(obj);
break;
@@ -962,7 +1123,7 @@ int klp_module_coming(struct module *mod)
*/
mod->klp_alive = true;
- list_for_each_entry(patch, &klp_patches, list) {
+ klp_for_each_patch(patch) {
klp_for_each_object(patch, obj) {
if (!klp_is_module(obj) || strcmp(obj->name, mod->name))
continue;
@@ -976,13 +1137,6 @@ int klp_module_coming(struct module *mod)
goto err;
}
- /*
- * Only patch the module if the patch is enabled or is
- * in transition.
- */
- if (!patch->enabled && patch != klp_transition_patch)
- break;
-
pr_notice("applying patch '%s' to loading module '%s'\n",
patch->mod->name, obj->mod->name);
@@ -1048,14 +1202,6 @@ void klp_module_going(struct module *mod)
static int __init klp_init(void)
{
- int ret;
-
- ret = klp_check_compiler_support();
- if (ret) {
- pr_info("Your compiler is too old; turning off.\n");
- return -EINVAL;
- }
-
klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
if (!klp_root_kobj)
return -ENOMEM;
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index 48a83d4364cf..ec43a40b853f 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -5,6 +5,17 @@
#include <linux/livepatch.h>
extern struct mutex klp_mutex;
+extern struct list_head klp_patches;
+
+#define klp_for_each_patch_safe(patch, tmp_patch) \
+ list_for_each_entry_safe(patch, tmp_patch, &klp_patches, list)
+
+#define klp_for_each_patch(patch) \
+ list_for_each_entry(patch, &klp_patches, list)
+
+void klp_free_patch_start(struct klp_patch *patch);
+void klp_discard_replaced_patches(struct klp_patch *new_patch);
+void klp_discard_nops(struct klp_patch *new_patch);
static inline bool klp_is_object_loaded(struct klp_object *obj)
{
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 7702cb4064fc..bd43537702bd 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* patch.c - livepatch patching functions
*
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2014 SUSE
* Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -34,7 +22,7 @@
static LIST_HEAD(klp_ops);
-struct klp_ops *klp_find_ops(unsigned long old_addr)
+struct klp_ops *klp_find_ops(void *old_func)
{
struct klp_ops *ops;
struct klp_func *func;
@@ -42,7 +30,7 @@ struct klp_ops *klp_find_ops(unsigned long old_addr)
list_for_each_entry(ops, &klp_ops, node) {
func = list_first_entry(&ops->func_stack, struct klp_func,
stack_node);
- if (func->old_addr == old_addr)
+ if (func->old_func == old_func)
return ops;
}
@@ -118,7 +106,15 @@ static void notrace klp_ftrace_handler(unsigned long ip,
}
}
+ /*
+ * NOPs are used to replace existing patches with original code.
+ * Do nothing! Setting pc would cause an infinite loop.
+ */
+ if (func->nop)
+ goto unlock;
+
klp_arch_set_pc(regs, (unsigned long)func->new_func);
+
unlock:
preempt_enable_notrace();
}
@@ -142,17 +138,18 @@ static void klp_unpatch_func(struct klp_func *func)
if (WARN_ON(!func->patched))
return;
- if (WARN_ON(!func->old_addr))
+ if (WARN_ON(!func->old_func))
return;
- ops = klp_find_ops(func->old_addr);
+ ops = klp_find_ops(func->old_func);
if (WARN_ON(!ops))
return;
if (list_is_singular(&ops->func_stack)) {
unsigned long ftrace_loc;
- ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ ftrace_loc =
+ klp_get_ftrace_location((unsigned long)func->old_func);
if (WARN_ON(!ftrace_loc))
return;
@@ -174,17 +171,18 @@ static int klp_patch_func(struct klp_func *func)
struct klp_ops *ops;
int ret;
- if (WARN_ON(!func->old_addr))
+ if (WARN_ON(!func->old_func))
return -EINVAL;
if (WARN_ON(func->patched))
return -EINVAL;
- ops = klp_find_ops(func->old_addr);
+ ops = klp_find_ops(func->old_func);
if (!ops) {
unsigned long ftrace_loc;
- ftrace_loc = klp_get_ftrace_location(func->old_addr);
+ ftrace_loc =
+ klp_get_ftrace_location((unsigned long)func->old_func);
if (!ftrace_loc) {
pr_err("failed to find location for function '%s'\n",
func->old_name);
@@ -236,15 +234,26 @@ err:
return ret;
}
-void klp_unpatch_object(struct klp_object *obj)
+static void __klp_unpatch_object(struct klp_object *obj, bool nops_only)
{
struct klp_func *func;
- klp_for_each_func(obj, func)
+ klp_for_each_func(obj, func) {
+ if (nops_only && !func->nop)
+ continue;
+
if (func->patched)
klp_unpatch_func(func);
+ }
+
+ if (obj->dynamic || !nops_only)
+ obj->patched = false;
+}
+
- obj->patched = false;
+void klp_unpatch_object(struct klp_object *obj)
+{
+ __klp_unpatch_object(obj, false);
}
int klp_patch_object(struct klp_object *obj)
@@ -267,11 +276,21 @@ int klp_patch_object(struct klp_object *obj)
return 0;
}
-void klp_unpatch_objects(struct klp_patch *patch)
+static void __klp_unpatch_objects(struct klp_patch *patch, bool nops_only)
{
struct klp_object *obj;
klp_for_each_object(patch, obj)
if (obj->patched)
- klp_unpatch_object(obj);
+ __klp_unpatch_object(obj, nops_only);
+}
+
+void klp_unpatch_objects(struct klp_patch *patch)
+{
+ __klp_unpatch_objects(patch, false);
+}
+
+void klp_unpatch_objects_dynamic(struct klp_patch *patch)
+{
+ __klp_unpatch_objects(patch, true);
}
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
index e72d8250d04b..d5f2fbe373e0 100644
--- a/kernel/livepatch/patch.h
+++ b/kernel/livepatch/patch.h
@@ -10,7 +10,7 @@
* struct klp_ops - structure for tracking registered ftrace ops structs
*
* A single ftrace_ops is shared between all enabled replacement functions
- * (klp_func structs) which have the same old_addr. This allows the switch
+ * (klp_func structs) which have the same old_func. This allows the switch
* between function versions to happen instantaneously by updating the klp_ops
* struct's func_stack list. The winner is the klp_func at the top of the
* func_stack (front of the list).
@@ -25,10 +25,11 @@ struct klp_ops {
struct ftrace_ops fops;
};
-struct klp_ops *klp_find_ops(unsigned long old_addr);
+struct klp_ops *klp_find_ops(void *old_func);
int klp_patch_object(struct klp_object *obj);
void klp_unpatch_object(struct klp_object *obj);
void klp_unpatch_objects(struct klp_patch *patch);
+void klp_unpatch_objects_dynamic(struct klp_patch *patch);
#endif /* _LIVEPATCH_PATCH_H */
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index 83958c814439..e5c9fb295ba9 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* shadow.c - Shadow Variables
*
* Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/**
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 304d5eb8a98c..cdf318d86dd6 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* transition.c - Kernel Live Patching transition functions
*
* Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -29,11 +17,13 @@
#define MAX_STACK_ENTRIES 100
#define STACK_ERR_BUF_SIZE 128
+#define SIGNALS_TIMEOUT 15
+
struct klp_patch *klp_transition_patch;
static int klp_target_state = KLP_UNDEFINED;
-static bool klp_forced = false;
+static unsigned int klp_signals_cnt;
/*
* This work can be performed periodically to finish patching or unpatching any
@@ -87,6 +77,11 @@ static void klp_complete_transition(void)
klp_transition_patch->mod->name,
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+ if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) {
+ klp_discard_replaced_patches(klp_transition_patch);
+ klp_discard_nops(klp_transition_patch);
+ }
+
if (klp_target_state == KLP_UNPATCHED) {
/*
* All tasks have transitioned to KLP_UNPATCHED so we can now
@@ -136,13 +131,6 @@ static void klp_complete_transition(void)
pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
- /*
- * klp_forced set implies unbounded increase of module's ref count if
- * the module is disabled/enabled in a loop.
- */
- if (!klp_forced && klp_target_state == KLP_UNPATCHED)
- module_put(klp_transition_patch->mod);
-
klp_target_state = KLP_UNDEFINED;
klp_transition_patch = NULL;
}
@@ -202,15 +190,15 @@ void klp_update_patch_state(struct task_struct *task)
* Determine whether the given stack trace includes any references to a
* to-be-patched or to-be-unpatched function.
*/
-static int klp_check_stack_func(struct klp_func *func,
- struct stack_trace *trace)
+static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
+ unsigned int nr_entries)
{
unsigned long func_addr, func_size, address;
struct klp_ops *ops;
int i;
- for (i = 0; i < trace->nr_entries; i++) {
- address = trace->entries[i];
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
if (klp_target_state == KLP_UNPATCHED) {
/*
@@ -224,11 +212,11 @@ static int klp_check_stack_func(struct klp_func *func,
* Check for the to-be-patched function
* (the previous func).
*/
- ops = klp_find_ops(func->old_addr);
+ ops = klp_find_ops(func->old_func);
if (list_is_singular(&ops->func_stack)) {
/* original function */
- func_addr = func->old_addr;
+ func_addr = (unsigned long)func->old_func;
func_size = func->old_size;
} else {
/* previously patched function */
@@ -254,29 +242,24 @@ static int klp_check_stack_func(struct klp_func *func,
static int klp_check_stack(struct task_struct *task, char *err_buf)
{
static unsigned long entries[MAX_STACK_ENTRIES];
- struct stack_trace trace;
struct klp_object *obj;
struct klp_func *func;
- int ret;
+ int ret, nr_entries;
- trace.skip = 0;
- trace.nr_entries = 0;
- trace.max_entries = MAX_STACK_ENTRIES;
- trace.entries = entries;
- ret = save_stack_trace_tsk_reliable(task, &trace);
- WARN_ON_ONCE(ret == -ENOSYS);
- if (ret) {
+ ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
+ if (ret < 0) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
"%s: %s:%d has an unreliable stack\n",
__func__, task->comm, task->pid);
return ret;
}
+ nr_entries = ret;
klp_for_each_object(klp_transition_patch, obj) {
if (!obj->patched)
continue;
klp_for_each_func(obj, func) {
- ret = klp_check_stack_func(func, &trace);
+ ret = klp_check_stack_func(func, entries, nr_entries);
if (ret) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
"%s: %s:%d is sleeping on function %s\n",
@@ -297,11 +280,11 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
*/
static bool klp_try_switch_task(struct task_struct *task)
{
+ static char err_buf[STACK_ERR_BUF_SIZE];
struct rq *rq;
struct rq_flags flags;
int ret;
bool success = false;
- char err_buf[STACK_ERR_BUF_SIZE];
err_buf[0] = '\0';
@@ -310,6 +293,13 @@ static bool klp_try_switch_task(struct task_struct *task)
return true;
/*
+ * For arches which don't have reliable stack traces, we have to rely
+ * on other methods (e.g., switching tasks at kernel exit).
+ */
+ if (!klp_have_reliable_stack())
+ return false;
+
+ /*
* Now try to check the stack for any to-be-patched or to-be-unpatched
* functions. If all goes well, switch the task to the target patch
* state.
@@ -344,7 +334,47 @@ done:
pr_debug("%s", err_buf);
return success;
+}
+
+/*
+ * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
+ * Kthreads with TIF_PATCH_PENDING set are woken up.
+ */
+static void klp_send_signals(void)
+{
+ struct task_struct *g, *task;
+
+ if (klp_signals_cnt == SIGNALS_TIMEOUT)
+ pr_notice("signaling remaining tasks\n");
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, task) {
+ if (!klp_patch_pending(task))
+ continue;
+ /*
+ * There is a small race here. We could see TIF_PATCH_PENDING
+ * set and decide to wake up a kthread or send a fake signal.
+ * Meanwhile the task could migrate itself and the action
+ * would be meaningless. It is not serious though.
+ */
+ if (task->flags & PF_KTHREAD) {
+ /*
+ * Wake up a kthread which sleeps interruptedly and
+ * still has not been migrated.
+ */
+ wake_up_state(task, TASK_INTERRUPTIBLE);
+ } else {
+ /*
+ * Send fake signal to all non-kthread tasks which are
+ * still not migrated.
+ */
+ spin_lock_irq(&task->sighand->siglock);
+ signal_wake_up(task, 0);
+ spin_unlock_irq(&task->sighand->siglock);
+ }
+ }
+ read_unlock(&tasklist_lock);
}
/*
@@ -359,6 +389,7 @@ void klp_try_complete_transition(void)
{
unsigned int cpu;
struct task_struct *g, *task;
+ struct klp_patch *patch;
bool complete = true;
WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
@@ -396,6 +427,10 @@ void klp_try_complete_transition(void)
put_online_cpus();
if (!complete) {
+ if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT))
+ klp_send_signals();
+ klp_signals_cnt++;
+
/*
* Some tasks weren't able to be switched over. Try again
* later and/or wait for other methods like kernel exit
@@ -407,7 +442,18 @@ void klp_try_complete_transition(void)
}
/* we're done, now cleanup the data structures */
+ patch = klp_transition_patch;
klp_complete_transition();
+
+ /*
+ * It would make more sense to free the patch in
+ * klp_complete_transition() but it is called also
+ * from klp_cancel_transition().
+ */
+ if (!patch->enabled) {
+ klp_free_patch_start(patch);
+ schedule_work(&patch->free_work);
+ }
}
/*
@@ -446,6 +492,8 @@ void klp_start_transition(void)
if (task->patch_state != klp_target_state)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+
+ klp_signals_cnt = 0;
}
/*
@@ -569,47 +617,6 @@ void klp_copy_process(struct task_struct *child)
}
/*
- * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
- * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this
- * action currently.
- */
-void klp_send_signals(void)
-{
- struct task_struct *g, *task;
-
- pr_notice("signaling remaining tasks\n");
-
- read_lock(&tasklist_lock);
- for_each_process_thread(g, task) {
- if (!klp_patch_pending(task))
- continue;
-
- /*
- * There is a small race here. We could see TIF_PATCH_PENDING
- * set and decide to wake up a kthread or send a fake signal.
- * Meanwhile the task could migrate itself and the action
- * would be meaningless. It is not serious though.
- */
- if (task->flags & PF_KTHREAD) {
- /*
- * Wake up a kthread which sleeps interruptedly and
- * still has not been migrated.
- */
- wake_up_state(task, TASK_INTERRUPTIBLE);
- } else {
- /*
- * Send fake signal to all non-kthread tasks which are
- * still not migrated.
- */
- spin_lock_irq(&task->sighand->siglock);
- signal_wake_up(task, 0);
- spin_unlock_irq(&task->sighand->siglock);
- }
- }
- read_unlock(&tasklist_lock);
-}
-
-/*
* Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an
* existing transition to finish.
*
@@ -620,6 +627,7 @@ void klp_send_signals(void)
*/
void klp_force_transition(void)
{
+ struct klp_patch *patch;
struct task_struct *g, *task;
unsigned int cpu;
@@ -633,5 +641,6 @@ void klp_force_transition(void)
for_each_possible_cpu(cpu)
klp_update_patch_state(idle_task(cpu));
- klp_forced = true;
+ klp_for_each_patch(patch)
+ patch->forced = true;
}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
index f9d0bc016067..322db16233de 100644
--- a/kernel/livepatch/transition.h
+++ b/kernel/livepatch/transition.h
@@ -11,7 +11,6 @@ void klp_cancel_transition(void);
void klp_start_transition(void);
void klp_try_complete_transition(void);
void klp_reverse_transition(void);
-void klp_send_signals(void);
void klp_force_transition(void);
#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 392c7f23af76..45452facff3b 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
-obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
-obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
+obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
new file mode 100644
index 000000000000..fa2c2f951c6b
--- /dev/null
+++ b/kernel/locking/lock_events.c
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * Collect locking event counts
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/fs.h>
+
+#include "lock_events.h"
+
+#undef LOCK_EVENT
+#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
+
+#define LOCK_EVENTS_DIR "lock_event_counts"
+
+/*
+ * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different
+ * types of locks will be reported under the <debugfs>/lock_event_counts/
+ * directory. See lock_events_list.h for the list of available locking
+ * events.
+ *
+ * Writing to the special ".reset_counts" file will reset all the above
+ * locking event counts. This is a very slow operation and so should not
+ * be done frequently.
+ *
+ * These event counts are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counts usable even in a production
+ * environment.
+ */
+static const char * const lockevent_names[lockevent_num + 1] = {
+
+#include "lock_events_list.h"
+
+ [LOCKEVENT_reset_cnts] = ".reset_counts",
+};
+
+/*
+ * Per-cpu counts
+ */
+DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]);
+
+/*
+ * The lockevent_read() function can be overridden.
+ */
+ssize_t __weak lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[64];
+ int cpu, id, len;
+ u64 sum = 0;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ id = (long)file_inode(file)->i_private;
+
+ if (id >= lockevent_num)
+ return -EBADF;
+
+ for_each_possible_cpu(cpu)
+ sum += per_cpu(lockevents[id], cpu);
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Function to handle write request
+ *
+ * When idx = reset_cnts, reset all the counts.
+ */
+static ssize_t lockevent_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ int cpu;
+
+ /*
+ * Get the counter ID stored in file->f_inode->i_private
+ */
+ if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts)
+ return count;
+
+ for_each_possible_cpu(cpu) {
+ int i;
+ unsigned long *ptr = per_cpu_ptr(lockevents, cpu);
+
+ for (i = 0 ; i < lockevent_num; i++)
+ WRITE_ONCE(ptr[i], 0);
+ }
+ return count;
+}
+
+/*
+ * Debugfs data structures
+ */
+static const struct file_operations fops_lockevent = {
+ .read = lockevent_read,
+ .write = lockevent_write,
+ .llseek = default_llseek,
+};
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#include <asm/paravirt.h>
+
+static bool __init skip_lockevent(const char *name)
+{
+ static int pv_on __initdata = -1;
+
+ if (pv_on < 0)
+ pv_on = !pv_is_native_spin_unlock();
+ /*
+ * Skip PV qspinlock events on bare metal.
+ */
+ if (!pv_on && !memcmp(name, "pv_", 3))
+ return true;
+ return false;
+}
+#else
+static inline bool skip_lockevent(const char *name)
+{
+ return false;
+}
+#endif
+
+/*
+ * Initialize debugfs for the locking event counts.
+ */
+static int __init init_lockevent_counts(void)
+{
+ struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
+ int i;
+
+ if (!d_counts)
+ goto out;
+
+ /*
+ * Create the debugfs files
+ *
+ * As reading from and writing to the stat files can be slow, only
+ * root is allowed to do the read/write to limit impact to system
+ * performance.
+ */
+ for (i = 0; i < lockevent_num; i++) {
+ if (skip_lockevent(lockevent_names[i]))
+ continue;
+ if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
+ (void *)(long)i, &fops_lockevent))
+ goto fail_undo;
+ }
+
+ if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
+ d_counts, (void *)(long)LOCKEVENT_reset_cnts,
+ &fops_lockevent))
+ goto fail_undo;
+
+ return 0;
+fail_undo:
+ debugfs_remove_recursive(d_counts);
+out:
+ pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR);
+ return -ENOMEM;
+}
+fs_initcall(init_lockevent_counts);
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
new file mode 100644
index 000000000000..8c7e7d25f09c
--- /dev/null
+++ b/kernel/locking/lock_events.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+
+#ifndef __LOCKING_LOCK_EVENTS_H
+#define __LOCKING_LOCK_EVENTS_H
+
+enum lock_events {
+
+#include "lock_events_list.h"
+
+ lockevent_num, /* Total number of lock event counts */
+ LOCKEVENT_reset_cnts = lockevent_num,
+};
+
+#ifdef CONFIG_LOCK_EVENT_COUNTS
+/*
+ * Per-cpu counters
+ */
+DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
+
+/*
+ * Increment the statistical counters. use raw_cpu_inc() because of lower
+ * overhead and we don't care if we loose the occasional update.
+ */
+static inline void __lockevent_inc(enum lock_events event, bool cond)
+{
+ if (cond)
+ raw_cpu_inc(lockevents[event]);
+}
+
+#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
+#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c)
+
+static inline void __lockevent_add(enum lock_events event, int inc)
+{
+ raw_cpu_add(lockevents[event], inc);
+}
+
+#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
+
+#else /* CONFIG_LOCK_EVENT_COUNTS */
+
+#define lockevent_inc(ev)
+#define lockevent_add(ev, c)
+#define lockevent_cond_inc(ev, c)
+
+#endif /* CONFIG_LOCK_EVENT_COUNTS */
+#endif /* __LOCKING_LOCK_EVENTS_H */
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
new file mode 100644
index 000000000000..239039d0ce21
--- /dev/null
+++ b/kernel/locking/lock_events_list.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ */
+
+#ifndef LOCK_EVENT
+#define LOCK_EVENT(name) LOCKEVENT_ ## name,
+#endif
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+/*
+ * Locking events for PV qspinlock.
+ */
+LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */
+LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */
+LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */
+LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */
+LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */
+LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */
+LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */
+LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */
+LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */
+LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */
+LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+/*
+ * Locking events for qspinlock
+ *
+ * Subtracting lock_use_node[234] from lock_slowpath will give you
+ * lock_use_node1.
+ */
+LOCK_EVENT(lock_pending) /* # of locking ops via pending code */
+LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */
+LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */
+LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */
+LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */
+LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+/*
+ * Locking events for rwsem
+ */
+LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
+LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
+LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
+LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
+LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */
+LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */
+LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */
+LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */
+LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */
+LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */
+LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
+LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
+LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
+LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
+LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
+LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
+LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 95932333a48b..341f52117f88 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/lockdep.c
*
@@ -45,11 +46,14 @@
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
+#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/gfp.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/nmi.h>
+#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
#include <asm/sections.h>
@@ -81,6 +85,7 @@ module_param(lock_stat, int, 0644);
* code to recurse back into the lockdep code...
*/
static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+static struct task_struct *lockdep_selftest_task_struct;
static int graph_lock(void)
{
@@ -130,29 +135,44 @@ static inline int debug_locks_off_graph_unlock(void)
unsigned long nr_list_entries;
static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
/*
* All data structures here are protected by the global debug_lock.
*
- * Mutex key structs only get allocated, once during bootup, and never
- * get freed - this significantly simplifies the debugging code.
+ * nr_lock_classes is the number of elements of lock_classes[] that is
+ * in use.
*/
+#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define KEYHASH_SIZE (1UL << KEYHASH_BITS)
+static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
#ifndef CONFIG_DEBUG_LOCKDEP
static
#endif
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
- if (!hlock->class_idx) {
+ unsigned int class_idx = hlock->class_idx;
+
+ /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */
+ barrier();
+
+ if (!test_bit(class_idx, lock_classes_in_use)) {
/*
* Someone passed in garbage, we give up.
*/
DEBUG_LOCKS_WARN_ON(1);
return NULL;
}
- return lock_classes + hlock->class_idx - 1;
+
+ /*
+ * At this point, if the passed hlock->class_idx is still garbage,
+ * we just have to live with it
+ */
+ return lock_classes + class_idx;
}
#ifdef CONFIG_LOCK_STAT
@@ -277,11 +297,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
#endif
/*
- * We keep a global list of all lock classes. The list only grows,
- * never shrinks. The list is only accessed with the lockdep
- * spinlock lock held.
+ * We keep a global list of all lock classes. The list is only accessed with
+ * the lockdep spinlock lock held. free_lock_classes is a list with free
+ * elements. These elements are linked together by the lock_entry member in
+ * struct lock_class.
*/
LIST_HEAD(all_lock_classes);
+static LIST_HEAD(free_lock_classes);
+
+/**
+ * struct pending_free - information about data structures about to be freed
+ * @zapped: Head of a list with struct lock_class elements.
+ * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements
+ * are about to be freed.
+ */
+struct pending_free {
+ struct list_head zapped;
+ DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS);
+};
+
+/**
+ * struct delayed_free - data structures used for delayed freeing
+ *
+ * A data structure for delayed freeing of data structures that may be
+ * accessed by RCU readers at the time these were freed.
+ *
+ * @rcu_head: Used to schedule an RCU callback for freeing data structures.
+ * @index: Index of @pf to which freed data structures are added.
+ * @scheduled: Whether or not an RCU callback has been scheduled.
+ * @pf: Array with information about data structures about to be freed.
+ */
+static struct delayed_free {
+ struct rcu_head rcu_head;
+ int index;
+ int scheduled;
+ struct pending_free pf[2];
+} delayed_free;
/*
* The lockdep classes are in a hash-table as well, for fast lookup:
@@ -319,6 +370,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx)
return k0 | (u64)k1 << 32;
}
+void lockdep_init_task(struct task_struct *task)
+{
+ task->lockdep_depth = 0; /* no locks held yet */
+ task->curr_chain_key = INITIAL_CHAIN_KEY;
+ task->lockdep_recursion = 0;
+}
+
void lockdep_off(void)
{
current->lockdep_recursion++;
@@ -331,6 +389,11 @@ void lockdep_on(void)
}
EXPORT_SYMBOL(lockdep_on);
+void lockdep_set_selftest_task(struct task_struct *task)
+{
+ lockdep_selftest_task_struct = task;
+}
+
/*
* Debugging switches:
*/
@@ -374,13 +437,6 @@ static int verbose(struct lock_class *class)
return 0;
}
-/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
- */
-unsigned long nr_stack_trace_entries;
-static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
-
static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
@@ -390,29 +446,23 @@ static void print_lockdep_off(const char *bug_msg)
#endif
}
-static int save_trace(struct stack_trace *trace)
-{
- trace->nr_entries = 0;
- trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- trace->entries = stack_trace + nr_stack_trace_entries;
-
- trace->skip = 3;
-
- save_stack_trace(trace);
+unsigned long nr_stack_trace_entries;
- /*
- * Some daft arches put -1 at the end to indicate its a full trace.
- *
- * <rant> this is buggy anyway, since it takes a whole extra entry so a
- * complete trace that maxes out the entries provided will be reported
- * as incomplete, friggin useless </rant>
- */
- if (trace->nr_entries != 0 &&
- trace->entries[trace->nr_entries-1] == ULONG_MAX)
- trace->nr_entries--;
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the graph_lock.
+ */
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
- trace->max_entries = trace->nr_entries;
+static int save_trace(struct lock_trace *trace)
+{
+ unsigned long *entries = stack_trace + nr_stack_trace_entries;
+ unsigned int max_entries;
+ trace->offset = nr_stack_trace_entries;
+ max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ trace->nr_entries = stack_trace_save(entries, max_entries, 3);
nr_stack_trace_entries += trace->nr_entries;
if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
@@ -427,6 +477,7 @@ static int save_trace(struct stack_trace *trace)
return 1;
}
+#endif
unsigned int nr_hardirq_chains;
unsigned int nr_softirq_chains;
@@ -440,6 +491,7 @@ unsigned int max_lockdep_depth;
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* Locking printouts:
*/
@@ -457,6 +509,7 @@ static const char *usage_str[] =
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
};
+#endif
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
{
@@ -470,15 +523,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit)
static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
{
+ /*
+ * The usage character defaults to '.' (i.e., irqs disabled and not in
+ * irq context), which is the safest usage category.
+ */
char c = '.';
- if (class->usage_mask & lock_flag(bit + 2))
+ /*
+ * The order of the following usage checks matters, which will
+ * result in the outcome character as follows:
+ *
+ * - '+': irq is enabled and not in irq context
+ * - '-': in irq context and irq is disabled
+ * - '?': in irq context and irq is enabled
+ */
+ if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) {
c = '+';
- if (class->usage_mask & lock_flag(bit)) {
- c = '-';
- if (class->usage_mask & lock_flag(bit + 2))
+ if (class->usage_mask & lock_flag(bit))
c = '?';
- }
+ } else if (class->usage_mask & lock_flag(bit))
+ c = '-';
return c;
}
@@ -542,19 +606,22 @@ static void print_lock(struct held_lock *hlock)
/*
* We can be called locklessly through debug_show_all_locks() so be
* extra careful, the hlock might have been released and cleared.
+ *
+ * If this indeed happens, lets pretend it does not hurt to continue
+ * to print the lock unless the hlock class_idx does not point to a
+ * registered class. The rationale here is: since we don't attempt
+ * to distinguish whether we are in this situation, if it just
+ * happened we can't count on class_idx to tell either.
*/
- unsigned int class_idx = hlock->class_idx;
-
- /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
- barrier();
+ struct lock_class *lock = hlock_class(hlock);
- if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
+ if (!lock) {
printk(KERN_CONT "<RELEASED>\n");
return;
}
printk(KERN_CONT "%p", hlock->instance);
- print_lock_name(lock_classes + class_idx - 1);
+ print_lock_name(lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -599,12 +666,15 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
-static int static_obj(void *obj)
+static int static_obj(const void *obj)
{
unsigned long start = (unsigned long) &_stext,
end = (unsigned long) &_end,
addr = (unsigned long) obj;
+ if (arch_is_kernel_initmem_freed(addr))
+ return 0;
+
/*
* static variable?
*/
@@ -699,7 +769,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name);
+ WARN_ON_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__);
return class;
}
}
@@ -716,6 +787,17 @@ static bool assign_lock_key(struct lockdep_map *lock)
{
unsigned long can_addr, addr = (unsigned long)lock;
+#ifdef __KERNEL__
+ /*
+ * lockdep_free_key_range() assumes that struct lock_class_key
+ * objects do not overlap. Since we use the address of lock
+ * objects as class key for static objects, check whether the
+ * size of lock_class_key objects does not exceed the size of
+ * the smallest lock object.
+ */
+ BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t));
+#endif
+
if (__is_kernel_percpu_address(addr, &can_addr))
lock->key = (void *)can_addr;
else if (__is_module_percpu_address(addr, &can_addr))
@@ -735,6 +817,289 @@ static bool assign_lock_key(struct lockdep_map *lock)
return true;
}
+#ifdef CONFIG_DEBUG_LOCKDEP
+
+/* Check whether element @e occurs in list @h */
+static bool in_list(struct list_head *e, struct list_head *h)
+{
+ struct list_head *f;
+
+ list_for_each(f, h) {
+ if (e == f)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check whether entry @e occurs in any of the locks_after or locks_before
+ * lists.
+ */
+static bool in_any_class_list(struct list_head *e)
+{
+ struct lock_class *class;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (in_list(e, &class->locks_after) ||
+ in_list(e, &class->locks_before))
+ return true;
+ }
+ return false;
+}
+
+static bool class_lock_list_valid(struct lock_class *c, struct list_head *h)
+{
+ struct lock_list *e;
+
+ list_for_each_entry(e, h, entry) {
+ if (e->links_to != c) {
+ printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s",
+ c->name ? : "(?)",
+ (unsigned long)(e - list_entries),
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)",
+ e->class && e->class->name ? e->class->name :
+ "(?)");
+ return false;
+ }
+ }
+ return true;
+}
+
+#ifdef CONFIG_PROVE_LOCKING
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+#endif
+
+static bool check_lock_chain_key(struct lock_chain *chain)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ u64 chain_key = INITIAL_CHAIN_KEY;
+ int i;
+
+ for (i = chain->base; i < chain->base + chain->depth; i++)
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
+ /*
+ * The 'unsigned long long' casts avoid that a compiler warning
+ * is reported when building tools/lib/lockdep.
+ */
+ if (chain->chain_key != chain_key) {
+ printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n",
+ (unsigned long long)(chain - lock_chains),
+ (unsigned long long)chain->chain_key,
+ (unsigned long long)chain_key);
+ return false;
+ }
+#endif
+ return true;
+}
+
+static bool in_any_zapped_class_list(struct lock_class *class)
+{
+ struct pending_free *pf;
+ int i;
+
+ for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) {
+ if (in_list(&class->lock_entry, &pf->zapped))
+ return true;
+ }
+
+ return false;
+}
+
+static bool __check_data_structures(void)
+{
+ struct lock_class *class;
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ struct lock_list *e;
+ int i;
+
+ /* Check whether all classes occur in a lock list. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!in_list(&class->lock_entry, &all_lock_classes) &&
+ !in_list(&class->lock_entry, &free_lock_classes) &&
+ !in_any_zapped_class_list(class)) {
+ printk(KERN_INFO "class %px/%s is not in any class list\n",
+ class, class->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /* Check whether all classes have valid lock lists. */
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ class = &lock_classes[i];
+ if (!class_lock_list_valid(class, &class->locks_before))
+ return false;
+ if (!class_lock_list_valid(class, &class->locks_after))
+ return false;
+ }
+
+ /* Check the chain_key of all lock chains. */
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ if (!check_lock_chain_key(chain))
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are in use occur in a class
+ * lock list.
+ */
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (!in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class->name ? : "(?)",
+ e->links_to->name ? : "(?)");
+ return false;
+ }
+ }
+
+ /*
+ * Check whether all list entries that are not in use do not occur in
+ * a class lock list.
+ */
+ for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ e = list_entries + i;
+ if (in_any_class_list(&e->entry)) {
+ printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n",
+ (unsigned int)(e - list_entries),
+ e->class && e->class->name ? e->class->name :
+ "(?)",
+ e->links_to && e->links_to->name ?
+ e->links_to->name : "(?)");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int check_consistency = 0;
+module_param(check_consistency, int, 0644);
+
+static void check_data_structures(void)
+{
+ static bool once = false;
+
+ if (check_consistency && !once) {
+ if (!__check_data_structures()) {
+ once = true;
+ WARN_ON(once);
+ }
+ }
+}
+
+#else /* CONFIG_DEBUG_LOCKDEP */
+
+static inline void check_data_structures(void) { }
+
+#endif /* CONFIG_DEBUG_LOCKDEP */
+
+/*
+ * Initialize the lock_classes[] array elements, the free_lock_classes list
+ * and also the delayed_free structure.
+ */
+static void init_data_structures_once(void)
+{
+ static bool ds_initialized, rcu_head_initialized;
+ int i;
+
+ if (likely(rcu_head_initialized))
+ return;
+
+ if (system_state >= SYSTEM_SCHEDULING) {
+ init_rcu_head(&delayed_free.rcu_head);
+ rcu_head_initialized = true;
+ }
+
+ if (ds_initialized)
+ return;
+
+ ds_initialized = true;
+
+ INIT_LIST_HEAD(&delayed_free.pf[0].zapped);
+ INIT_LIST_HEAD(&delayed_free.pf[1].zapped);
+
+ for (i = 0; i < ARRAY_SIZE(lock_classes); i++) {
+ list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes);
+ INIT_LIST_HEAD(&lock_classes[i].locks_after);
+ INIT_LIST_HEAD(&lock_classes[i].locks_before);
+ }
+}
+
+static inline struct hlist_head *keyhashentry(const struct lock_class_key *key)
+{
+ unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS);
+
+ return lock_keys_hash + hash;
+}
+
+/* Register a dynamically allocated key. */
+void lockdep_register_key(struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ unsigned long flags;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+ hash_head = keyhashentry(key);
+
+ raw_local_irq_save(flags);
+ if (!graph_lock())
+ goto restore_irqs;
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (WARN_ON_ONCE(k == key))
+ goto out_unlock;
+ }
+ hlist_add_head_rcu(&key->hash_entry, hash_head);
+out_unlock:
+ graph_unlock();
+restore_irqs:
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lockdep_register_key);
+
+/* Check whether a key has been registered as a dynamic key. */
+static bool is_dynamic_key(const struct lock_class_key *key)
+{
+ struct hlist_head *hash_head;
+ struct lock_class_key *k;
+ bool found = false;
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return false;
+
+ /*
+ * If lock debugging is disabled lock_keys_hash[] may contain
+ * pointers to memory that has already been freed. Avoid triggering
+ * a use-after-free in that case by returning early.
+ */
+ if (!debug_locks)
+ return true;
+
+ hash_head = keyhashentry(key);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
/*
* Register a lock's class in the hash-table, if the class is not present
* yet. Otherwise we look it up. We cache the result in the lock object
@@ -756,7 +1121,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
if (!lock->key) {
if (!assign_lock_key(lock))
return NULL;
- } else if (!static_obj(lock->key)) {
+ } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) {
return NULL;
}
@@ -775,11 +1140,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
goto out_unlock_set;
}
- /*
- * Allocate a new key from the static array, and add it to
- * the hash:
- */
- if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ init_data_structures_once();
+
+ /* Allocate a new lock class and add it to the hash. */
+ class = list_first_entry_or_null(&free_lock_classes, typeof(*class),
+ lock_entry);
+ if (!class) {
if (!debug_locks_off_graph_unlock()) {
return NULL;
}
@@ -788,13 +1154,14 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
dump_stack();
return NULL;
}
- class = lock_classes + nr_lock_classes++;
+ nr_lock_classes++;
+ __set_bit(class - lock_classes, lock_classes_in_use);
debug_atomic_inc(nr_unused_locks);
class->key = key;
class->name = lock->name;
class->subclass = subclass;
- INIT_LIST_HEAD(&class->locks_before);
- INIT_LIST_HEAD(&class->locks_after);
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
class->name_version = count_matching_names(class);
/*
* We use RCU's safe list-add method to make
@@ -802,9 +1169,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
*/
hlist_add_head_rcu(&class->hash_entry, hash_head);
/*
- * Add it to the global list of classes:
+ * Remove the class from the free list and add it to the global list
+ * of classes.
*/
- list_add_tail(&class->lock_entry, &all_lock_classes);
+ list_move_tail(&class->lock_entry, &all_lock_classes);
if (verbose(class)) {
graph_unlock();
@@ -845,7 +1213,10 @@ out_set_class_cache:
*/
static struct lock_list *alloc_list_entry(void)
{
- if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ int idx = find_first_zero_bit(list_entries_in_use,
+ ARRAY_SIZE(list_entries));
+
+ if (idx >= ARRAY_SIZE(list_entries)) {
if (!debug_locks_off_graph_unlock())
return NULL;
@@ -853,15 +1224,18 @@ static struct lock_list *alloc_list_entry(void)
dump_stack();
return NULL;
}
- return list_entries + nr_list_entries++;
+ nr_list_entries++;
+ __set_bit(idx, list_entries_in_use);
+ return list_entries + idx;
}
/*
* Add a new dependency to the head of the list:
*/
-static int add_lock_to_list(struct lock_class *this, struct list_head *head,
+static int add_lock_to_list(struct lock_class *this,
+ struct lock_class *links_to, struct list_head *head,
unsigned long ip, int distance,
- struct stack_trace *trace)
+ struct lock_trace *trace)
{
struct lock_list *entry;
/*
@@ -873,6 +1247,7 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head,
return 0;
entry->class = this;
+ entry->links_to = links_to;
entry->distance = distance;
entry->trace = *trace;
/*
@@ -892,13 +1267,17 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head,
#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
/*
- * The circular_queue and helpers is used to implement the
- * breadth-first search(BFS)algorithem, by which we can build
- * the shortest path from the next lock to be acquired to the
- * previous held lock if there is a circular between them.
+ * The circular_queue and helpers are used to implement graph
+ * breadth-first search (BFS) algorithm, by which we can determine
+ * whether there is a path from a lock to another. In deadlock checks,
+ * a path from the next lock to be acquired to a previous held lock
+ * indicates that adding the <prev> -> <next> lock dependency will
+ * produce a circle in the graph. Breadth-first search instead of
+ * depth-first search is used in order to find the shortest (circular)
+ * path.
*/
struct circular_queue {
- unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE];
unsigned int front, rear;
};
@@ -924,7 +1303,7 @@ static inline int __cq_full(struct circular_queue *cq)
return ((cq->rear + 1) & CQ_MASK) == cq->front;
}
-static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem)
{
if (__cq_full(cq))
return -1;
@@ -934,14 +1313,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
return 0;
}
-static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+/*
+ * Dequeue an element from the circular_queue, return a lock_list if
+ * the queue is not empty, or NULL if otherwise.
+ */
+static inline struct lock_list * __cq_dequeue(struct circular_queue *cq)
{
+ struct lock_list * lock;
+
if (__cq_empty(cq))
- return -1;
+ return NULL;
- *elem = cq->element[cq->front];
+ lock = cq->element[cq->front];
cq->front = (cq->front + 1) & CQ_MASK;
- return 0;
+
+ return lock;
}
static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
@@ -955,7 +1341,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
unsigned long nr;
nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
lock->parent = parent;
lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
@@ -965,7 +1351,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
unsigned long nr;
nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
+ WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
@@ -986,13 +1372,32 @@ static inline int get_lock_depth(struct lock_list *child)
return depth;
}
+/*
+ * Return the forward or backward dependency list.
+ *
+ * @lock: the lock_list to get its class's dependency list
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ */
+static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
+{
+ void *lock_class = lock->class;
+
+ return lock_class + offset;
+}
+
+/*
+ * Forward- or backward-dependency search, used for both circular dependency
+ * checking and hardirq-unsafe/softirq-unsafe checking.
+ */
static int __bfs(struct lock_list *source_entry,
void *data,
int (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry,
- int forward)
+ int offset)
{
struct lock_list *entry;
+ struct lock_list *lock;
struct list_head *head;
struct circular_queue *cq = &lock_cq;
int ret = 1;
@@ -1003,31 +1408,21 @@ static int __bfs(struct lock_list *source_entry,
goto exit;
}
- if (forward)
- head = &source_entry->class->locks_after;
- else
- head = &source_entry->class->locks_before;
-
+ head = get_dep_list(source_entry, offset);
if (list_empty(head))
goto exit;
__cq_init(cq);
- __cq_enqueue(cq, (unsigned long)source_entry);
+ __cq_enqueue(cq, source_entry);
- while (!__cq_empty(cq)) {
- struct lock_list *lock;
-
- __cq_dequeue(cq, (unsigned long *)&lock);
+ while ((lock = __cq_dequeue(cq))) {
if (!lock->class) {
ret = -2;
goto exit;
}
- if (forward)
- head = &lock->class->locks_after;
- else
- head = &lock->class->locks_before;
+ head = get_dep_list(lock, offset);
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -1041,7 +1436,7 @@ static int __bfs(struct lock_list *source_entry,
goto exit;
}
- if (__cq_enqueue(cq, (unsigned long)entry)) {
+ if (__cq_enqueue(cq, entry)) {
ret = -1;
goto exit;
}
@@ -1060,7 +1455,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry,
int (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 1);
+ return __bfs(src_entry, data, match, target_entry,
+ offsetof(struct lock_class, locks_after));
}
@@ -1069,31 +1465,31 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
int (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 0);
+ return __bfs(src_entry, data, match, target_entry,
+ offsetof(struct lock_class, locks_before));
}
-/*
- * Recursive, forwards-direction lock-dependency checking, used for
- * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
- * checking.
- */
+static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
+{
+ unsigned long *entries = stack_trace + trace->offset;
+
+ stack_trace_print(entries, trace->nr_entries, spaces);
+}
/*
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
*/
-static noinline int
+static noinline void
print_circular_bug_entry(struct lock_list *target, int depth)
{
if (debug_locks_silent)
- return 0;
+ return;
printk("\n-> #%u", depth);
print_lock_name(target->class);
printk(KERN_CONT ":\n");
- print_stack_trace(&target->trace, 6);
-
- return 0;
+ print_lock_trace(&target->trace, 6);
}
static void
@@ -1150,7 +1546,7 @@ print_circular_lock_scenario(struct held_lock *src,
* When a circular dependency is detected, print the
* header first:
*/
-static noinline int
+static noinline void
print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct held_lock *check_src,
struct held_lock *check_tgt)
@@ -1158,7 +1554,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct task_struct *curr = current;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("======================================================\n");
@@ -1176,8 +1572,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
print_circular_bug_entry(entry, depth);
-
- return 0;
}
static inline int class_equal(struct lock_list *entry, void *data)
@@ -1185,11 +1579,10 @@ static inline int class_equal(struct lock_list *entry, void *data)
return entry->class == data;
}
-static noinline int print_circular_bug(struct lock_list *this,
- struct lock_list *target,
- struct held_lock *check_src,
- struct held_lock *check_tgt,
- struct stack_trace *trace)
+static noinline void print_circular_bug(struct lock_list *this,
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1197,10 +1590,10 @@ static noinline int print_circular_bug(struct lock_list *this,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
if (!save_trace(&this->trace))
- return 0;
+ return;
depth = get_lock_depth(target);
@@ -1222,21 +1615,17 @@ static noinline int print_circular_bug(struct lock_list *this,
printk("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
-static noinline int print_bfs_bug(int ret)
+static noinline void print_bfs_bug(int ret)
{
if (!debug_locks_off_graph_unlock())
- return 0;
+ return;
/*
* Breadth-first-search failed, graph got corrupted?
*/
WARN(1, "lockdep bfs error:%d\n", ret);
-
- return 0;
}
static int noop_count(struct lock_list *entry, void *data)
@@ -1299,49 +1688,114 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
}
/*
- * Prove that the dependency graph starting at <entry> can not
- * lead to <target>. Print an error and return 0 if it does.
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. Print an error and return 0 if it does.
*/
static noinline int
-check_noncircular(struct lock_list *root, struct lock_class *target,
- struct lock_list **target_entry)
+check_path(struct lock_class *target, struct lock_list *src_entry,
+ struct lock_list **target_entry)
{
- int result;
+ int ret;
+
+ ret = __bfs_forwards(src_entry, (void *)target, class_equal,
+ target_entry);
+
+ if (unlikely(ret < 0))
+ print_bfs_bug(ret);
+
+ return ret;
+}
+
+/*
+ * Prove that the dependency graph starting at <src> can not
+ * lead to <target>. If it can, there is a circle when adding
+ * <target> -> <src> dependency.
+ *
+ * Print an error and return 0 if it does.
+ */
+static noinline int
+check_noncircular(struct held_lock *src, struct held_lock *target,
+ struct lock_trace *trace)
+{
+ int ret;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list src_entry = {
+ .class = hlock_class(src),
+ .parent = NULL,
+ };
debug_atomic_inc(nr_cyclic_checks);
- result = __bfs_forwards(root, target, class_equal, target_entry);
+ ret = check_path(hlock_class(target), &src_entry, &target_entry);
- return result;
+ if (unlikely(!ret)) {
+ if (!trace->nr_entries) {
+ /*
+ * If save_trace fails here, the printing might
+ * trigger a WARN but because of the !nr_entries it
+ * should not do bad things.
+ */
+ save_trace(trace);
+ }
+
+ print_circular_bug(&src_entry, target_entry, src, target);
+ }
+
+ return ret;
}
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Print an error and return 2 if it does or 1 if it does not.
+ */
static noinline int
-check_redundant(struct lock_list *root, struct lock_class *target,
- struct lock_list **target_entry)
+check_redundant(struct held_lock *src, struct held_lock *target)
{
- int result;
+ int ret;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list src_entry = {
+ .class = hlock_class(src),
+ .parent = NULL,
+ };
debug_atomic_inc(nr_redundant_checks);
- result = __bfs_forwards(root, target, class_equal, target_entry);
+ ret = check_path(hlock_class(target), &src_entry, &target_entry);
- return result;
+ if (!ret) {
+ debug_atomic_inc(nr_redundant);
+ ret = 2;
+ } else if (ret < 0)
+ ret = 0;
+
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+static inline int usage_accumulate(struct lock_list *entry, void *mask)
+{
+ *(unsigned long *)mask |= entry->class->usage_mask;
+
+ return 0;
}
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* Forwards and backwards subgraph searching, for the purposes of
* proving that two subgraphs can be connected by a new dependency
* without creating any illegal irq-safe -> irq-unsafe lock dependency.
*/
-static inline int usage_match(struct lock_list *entry, void *bit)
+static inline int usage_match(struct lock_list *entry, void *mask)
{
- return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
+ return entry->class->usage_mask & *(unsigned long *)mask;
}
-
-
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
@@ -1353,14 +1807,14 @@ static inline int usage_match(struct lock_list *entry, void *bit)
* Return <0 on error.
*/
static int
-find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
+find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
int result;
debug_atomic_inc(nr_find_usage_forwards_checks);
- result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
+ result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
return result;
}
@@ -1376,14 +1830,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
* Return <0 on error.
*/
static int
-find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
+find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
int result;
debug_atomic_inc(nr_find_usage_backwards_checks);
- result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
+ result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
return result;
}
@@ -1405,7 +1859,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
len += printk("%*s %s", depth, "", usage_str[bit]);
len += printk(KERN_CONT " at:\n");
- print_stack_trace(class->usage_traces + bit, len);
+ print_lock_trace(class->usage_traces + bit, len);
}
}
printk("%*s }\n", depth, "");
@@ -1419,7 +1873,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
*/
static void __used
print_shortest_lock_dependencies(struct lock_list *leaf,
- struct lock_list *root)
+ struct lock_list *root)
{
struct lock_list *entry = leaf;
int depth;
@@ -1430,7 +1884,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
do {
print_lock_class_header(entry->class, depth);
printk("%*s ... acquired at:\n", depth, "");
- print_stack_trace(&entry->trace, 2);
+ print_lock_trace(&entry->trace, 2);
printk("\n");
if (depth == 0 && (entry != root)) {
@@ -1441,8 +1895,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
entry = get_lock_parent(entry);
depth--;
} while (entry && (depth >= 0));
-
- return;
}
static void
@@ -1501,7 +1953,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_bad_irq_dependency(struct task_struct *curr,
struct lock_list *prev_root,
struct lock_list *next_root,
@@ -1514,7 +1966,7 @@ print_bad_irq_dependency(struct task_struct *curr,
const char *irqclass)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("=====================================================\n");
@@ -1543,14 +1995,14 @@ print_bad_irq_dependency(struct task_struct *curr,
print_lock_name(backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
- print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
+ print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
print_lock_name(forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
- print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
+ print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);
pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
@@ -1560,52 +2012,17 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
if (!save_trace(&prev_root->trace))
- return 0;
+ return;
print_shortest_lock_dependencies(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
if (!save_trace(&next_root->trace))
- return 0;
+ return;
print_shortest_lock_dependencies(forwards_entry, next_root);
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
-}
-
-static int
-check_usage(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, enum lock_usage_bit bit_backwards,
- enum lock_usage_bit bit_forwards, const char *irqclass)
-{
- int ret;
- struct lock_list this, that;
- struct lock_list *uninitialized_var(target_entry);
- struct lock_list *uninitialized_var(target_entry1);
-
- this.parent = NULL;
-
- this.class = hlock_class(prev);
- ret = find_usage_backwards(&this, bit_backwards, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
-
- that.parent = NULL;
- that.class = hlock_class(next);
- ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
- if (ret < 0)
- return print_bfs_bug(ret);
- if (ret == 1)
- return ret;
-
- return print_bad_irq_dependency(curr, &this, &that,
- target_entry, target_entry1,
- prev, next,
- bit_backwards, bit_forwards, irqclass);
}
static const char *state_names[] = {
@@ -1624,70 +2041,192 @@ static const char *state_rnames[] = {
static inline const char *state_name(enum lock_usage_bit bit)
{
- return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
+ if (bit & LOCK_USAGE_READ_MASK)
+ return state_rnames[bit >> LOCK_USAGE_DIR_MASK];
+ else
+ return state_names[bit >> LOCK_USAGE_DIR_MASK];
}
+/*
+ * The bit number is encoded like:
+ *
+ * bit0: 0 exclusive, 1 read lock
+ * bit1: 0 used in irq, 1 irq enabled
+ * bit2-n: state
+ */
static int exclusive_bit(int new_bit)
{
- /*
- * USED_IN
- * USED_IN_READ
- * ENABLED
- * ENABLED_READ
- *
- * bit 0 - write/read
- * bit 1 - used_in/enabled
- * bit 2+ state
- */
-
- int state = new_bit & ~3;
- int dir = new_bit & 2;
+ int state = new_bit & LOCK_USAGE_STATE_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* keep state, bit flip the direction and strip read.
*/
- return state | (dir ^ 2);
+ return state | (dir ^ LOCK_USAGE_DIR_MASK);
}
+/*
+ * Observe that when given a bitmask where each bitnr is encoded as above, a
+ * right shift of the mask transforms the individual bitnrs as -1 and
+ * conversely, a left shift transforms into +1 for the individual bitnrs.
+ *
+ * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can
+ * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0)
+ * instead by subtracting the bit number by 2, or shifting the mask right by 2.
+ *
+ * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2.
+ *
+ * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is
+ * all bits set) and recompose with bitnr1 flipped.
+ */
+static unsigned long invert_dir_mask(unsigned long mask)
+{
+ unsigned long excl = 0;
+
+ /* Invert dir */
+ excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK;
+ excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK;
+
+ return excl;
+}
+
+/*
+ * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all
+ * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*).
+ * And then mask out all bitnr0.
+ */
+static unsigned long exclusive_mask(unsigned long mask)
+{
+ unsigned long excl = invert_dir_mask(mask);
+
+ /* Strip read */
+ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
+ excl &= ~LOCKF_IRQ_READ;
+
+ return excl;
+}
+
+/*
+ * Retrieve the _possible_ original mask to which @mask is
+ * exclusive. Ie: this is the opposite of exclusive_mask().
+ * Note that 2 possible original bits can match an exclusive
+ * bit: one has LOCK_USAGE_READ_MASK set, the other has it
+ * cleared. So both are returned for each exclusive bit.
+ */
+static unsigned long original_mask(unsigned long mask)
+{
+ unsigned long excl = invert_dir_mask(mask);
+
+ /* Include read in existing usages */
+ excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
+
+ return excl;
+}
+
+/*
+ * Find the first pair of bit match between an original
+ * usage mask and an exclusive usage mask.
+ */
+static int find_exclusive_match(unsigned long mask,
+ unsigned long excl_mask,
+ enum lock_usage_bit *bitp,
+ enum lock_usage_bit *excl_bitp)
+{
+ int bit, excl;
+
+ for_each_set_bit(bit, &mask, LOCK_USED) {
+ excl = exclusive_bit(bit);
+ if (excl_mask & lock_flag(excl)) {
+ *bitp = bit;
+ *excl_bitp = excl;
+ return 0;
+ }
+ }
+ return -1;
+}
+
+/*
+ * Prove that the new dependency does not connect a hardirq-safe(-read)
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, enum lock_usage_bit bit)
+ struct held_lock *next)
{
+ unsigned long usage_mask = 0, forward_mask, backward_mask;
+ enum lock_usage_bit forward_bit = 0, backward_bit = 0;
+ struct lock_list *uninitialized_var(target_entry1);
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list this, that;
+ int ret;
+
/*
- * Prove that the new dependency does not connect a hardirq-safe
- * lock with a hardirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
+ * Step 1: gather all hard/soft IRQs usages backward in an
+ * accumulated usage mask.
*/
- if (!check_usage(curr, prev, next, bit,
- exclusive_bit(bit), state_name(bit)))
+ this.parent = NULL;
+ this.class = hlock_class(prev);
+
+ ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
+ if (ret < 0) {
+ print_bfs_bug(ret);
return 0;
+ }
- bit++; /* _READ */
+ usage_mask &= LOCKF_USED_IN_IRQ_ALL;
+ if (!usage_mask)
+ return 1;
/*
- * Prove that the new dependency does not connect a hardirq-safe-read
- * lock with a hardirq-unsafe lock - to achieve this we search
- * the backwards-subgraph starting at <prev>, and the
- * forwards-subgraph starting at <next>:
+ * Step 2: find exclusive uses forward that match the previous
+ * backward accumulated mask.
*/
- if (!check_usage(curr, prev, next, bit,
- exclusive_bit(bit), state_name(bit)))
+ forward_mask = exclusive_mask(usage_mask);
+
+ that.parent = NULL;
+ that.class = hlock_class(next);
+
+ ret = find_usage_forwards(&that, forward_mask, &target_entry1);
+ if (ret < 0) {
+ print_bfs_bug(ret);
return 0;
+ }
+ if (ret == 1)
+ return ret;
- return 1;
-}
+ /*
+ * Step 3: we found a bad match! Now retrieve a lock from the backward
+ * list whose usage mask matches the exclusive usage mask from the
+ * lock found on the forward list.
+ */
+ backward_mask = original_mask(target_entry1->class->usage_mask);
-static int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next)
-{
-#define LOCKDEP_STATE(__STATE) \
- if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
+ ret = find_usage_backwards(&this, backward_mask, &target_entry);
+ if (ret < 0) {
+ print_bfs_bug(ret);
return 0;
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
+ }
+ if (DEBUG_LOCKS_WARN_ON(ret == 1))
+ return 1;
- return 1;
+ /*
+ * Step 4: narrow down to a pair of incompatible usage bits
+ * and report it.
+ */
+ ret = find_exclusive_match(target_entry->class->usage_mask,
+ target_entry1->class->usage_mask,
+ &backward_bit, &forward_bit);
+ if (DEBUG_LOCKS_WARN_ON(ret == -1))
+ return 1;
+
+ print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
+ backward_bit, forward_bit,
+ state_name(backward_bit));
+
+ return 0;
}
static void inc_chains(void)
@@ -1704,9 +2243,8 @@ static void inc_chains(void)
#else
-static inline int
-check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next)
+static inline int check_irq_usage(struct task_struct *curr,
+ struct held_lock *prev, struct held_lock *next)
{
return 1;
}
@@ -1716,11 +2254,10 @@ static inline void inc_chains(void)
nr_process_chains++;
}
-#endif
+#endif /* CONFIG_TRACE_IRQFLAGS */
static void
-print_deadlock_scenario(struct held_lock *nxt,
- struct held_lock *prv)
+print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
{
struct lock_class *next = hlock_class(nxt);
struct lock_class *prev = hlock_class(prv);
@@ -1738,12 +2275,12 @@ print_deadlock_scenario(struct held_lock *nxt,
printk(" May be due to missing lock nesting notation\n\n");
}
-static int
+static void
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("============================================\n");
@@ -1762,8 +2299,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
/*
@@ -1775,8 +2310,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
* Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
*/
static int
-check_deadlock(struct task_struct *curr, struct held_lock *next,
- struct lockdep_map *next_instance, int read)
+check_deadlock(struct task_struct *curr, struct held_lock *next)
{
struct held_lock *prev;
struct held_lock *nest = NULL;
@@ -1795,7 +2329,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
* Allow read-after-read recursion of the same
* lock class (i.e. read_lock(lock)+read_lock(lock)):
*/
- if ((read == 2) && prev->read)
+ if ((next->read == 2) && prev->read)
return 2;
/*
@@ -1805,14 +2339,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
- return print_deadlock_bug(curr, prev, next);
+ print_deadlock_bug(curr, prev, next);
+ return 0;
}
return 1;
}
/*
* There was a chain-cache miss, and we are about to add a new dependency
- * to a previous lock. We recursively validate the following rules:
+ * to a previous lock. We validate the following rules:
*
* - would the adding of the <prev> -> <next> dependency create a
* circular dependency in the graph? [== circular deadlock]
@@ -1834,41 +2369,44 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, struct stack_trace *trace,
- int (*save)(struct stack_trace *trace))
+ struct held_lock *next, int distance, struct lock_trace *trace)
{
- struct lock_list *uninitialized_var(target_entry);
struct lock_list *entry;
- struct lock_list this;
int ret;
+ if (!hlock_class(prev)->key || !hlock_class(next)->key) {
+ /*
+ * The warning statements below may trigger a use-after-free
+ * of the class name. It is better to trigger a use-after free
+ * and to have the class name most of the time instead of not
+ * having the class name available.
+ */
+ WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(prev),
+ hlock_class(prev)->name);
+ WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key,
+ "Detected use-after-free of lock class %px/%s\n",
+ hlock_class(next),
+ hlock_class(next)->name);
+ return 2;
+ }
+
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
- * forward-recursing into the graph starting at <next>, and
- * checking whether we can reach <prev>.)
+ * a breadth-first search into the graph starting at <next>,
+ * and check whether we can reach <prev>.)
*
- * We are using global variables to control the recursion, to
- * keep the stackframe size of the recursive functions low:
+ * The search is limited by the size of the circular queue (i.e.,
+ * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes
+ * in the graph whose neighbours are to be checked.
*/
- this.class = hlock_class(next);
- this.parent = NULL;
- ret = check_noncircular(&this, hlock_class(prev), &target_entry);
- if (unlikely(!ret)) {
- if (!trace->entries) {
- /*
- * If @save fails here, the printing might trigger
- * a WARN but because of the !nr_entries it should
- * not do bad things.
- */
- save(trace);
- }
- return print_circular_bug(&this, target_entry, next, prev, trace);
- }
- else if (unlikely(ret < 0))
- return print_bfs_bug(ret);
+ ret = check_noncircular(next, prev, trace);
+ if (unlikely(ret <= 0))
+ return 0;
- if (!check_prev_add_irq(curr, prev, next))
+ if (!check_irq_usage(curr, prev, next))
return 0;
/*
@@ -1897,35 +2435,30 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
+#ifdef CONFIG_LOCKDEP_SMALL
/*
* Is the <prev> -> <next> link redundant?
*/
- this.class = hlock_class(prev);
- this.parent = NULL;
- ret = check_redundant(&this, hlock_class(next), &target_entry);
- if (!ret) {
- debug_atomic_inc(nr_redundant);
- return 2;
- }
- if (ret < 0)
- return print_bfs_bug(ret);
-
+ ret = check_redundant(prev, next);
+ if (ret != 1)
+ return ret;
+#endif
- if (!trace->entries && !save(trace))
+ if (!trace->nr_entries && !save_trace(trace))
return 0;
/*
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(hlock_class(next),
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
&hlock_class(prev)->locks_after,
next->acquire_ip, distance, trace);
if (!ret)
return 0;
- ret = add_lock_to_list(hlock_class(prev),
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
&hlock_class(next)->locks_before,
next->acquire_ip, distance, trace);
if (!ret)
@@ -1943,14 +2476,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
+ struct lock_trace trace = { .nr_entries = 0 };
int depth = curr->lockdep_depth;
struct held_lock *hlock;
- struct stack_trace trace = {
- .nr_entries = 0,
- .max_entries = 0,
- .entries = NULL,
- .skip = 0,
- };
/*
* Debugging checks.
@@ -1976,7 +2504,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
* added:
*/
if (hlock->read != 2 && hlock->check) {
- int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
+ int ret = check_prev_add(curr, hlock, next, distance,
+ &trace);
if (!ret)
return 0;
@@ -2018,8 +2547,8 @@ out_bug:
return 0;
}
-unsigned long nr_lock_chains;
struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS);
int nr_chain_hlocks;
static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
@@ -2065,12 +2594,13 @@ static void
print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
{
struct held_lock *hlock;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
int depth = curr->lockdep_depth;
- int i;
+ int i = get_first_held_lock(curr, hlock_next);
- printk("depth: %u\n", depth + 1);
- for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) {
+ printk("depth: %u (irq_context %u)\n", depth - i + 1,
+ hlock_next->irq_context);
+ for (; i < depth; i++) {
hlock = curr->held_locks + i;
chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
@@ -2084,13 +2614,13 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne
static void print_chain_keys_chain(struct lock_chain *chain)
{
int i;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
int class_id;
printk("depth: %u\n", chain->depth);
for (i = 0; i < chain->depth; i++) {
class_id = chain_hlocks[chain->base + i];
- chain_key = print_chain_key_iteration(class_id + 1, chain_key);
+ chain_key = print_chain_key_iteration(class_id, chain_key);
print_lock_name(lock_classes + class_id);
printk("\n");
@@ -2141,7 +2671,7 @@ static int check_no_collision(struct task_struct *curr,
}
for (j = 0; j < chain->depth - 1; j++, i++) {
- id = curr->held_locks[i].class_idx - 1;
+ id = curr->held_locks[i].class_idx;
if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
print_collision(curr, hlock, chain);
@@ -2153,6 +2683,33 @@ static int check_no_collision(struct task_struct *curr,
}
/*
+ * Given an index that is >= -1, return the index of the next lock chain.
+ * Return -2 if there is no next lock chain.
+ */
+long lockdep_next_lockchain(long i)
+{
+ i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1);
+ return i < ARRAY_SIZE(lock_chains) ? i : -2;
+}
+
+unsigned long lock_chain_count(void)
+{
+ return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains));
+}
+
+/* Must be called with the graph lock held. */
+static struct lock_chain *alloc_lock_chain(void)
+{
+ int idx = find_first_zero_bit(lock_chains_in_use,
+ ARRAY_SIZE(lock_chains));
+
+ if (unlikely(idx >= ARRAY_SIZE(lock_chains)))
+ return NULL;
+ __set_bit(idx, lock_chains_in_use);
+ return lock_chains + idx;
+}
+
+/*
* Adds a dependency chain into chain hashtable. And must be called with
* graph_lock held.
*
@@ -2169,19 +2726,15 @@ static inline int add_chain_cache(struct task_struct *curr,
int i, j;
/*
- * Allocate a new chain entry from the static array, and add
- * it to the hash:
- */
-
- /*
- * We might need to take the graph lock, ensure we've got IRQs
+ * The caller must hold the graph lock, ensure we've got IRQs
* disabled to make this an IRQ-safe lock.. for recursion reasons
* lockdep won't complain about its own locking errors.
*/
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
- if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ chain = alloc_lock_chain();
+ if (!chain) {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2189,7 +2742,6 @@ static inline int add_chain_cache(struct task_struct *curr,
dump_stack();
return 0;
}
- chain = lock_chains + nr_lock_chains++;
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
i = get_first_held_lock(curr, hlock);
@@ -2202,20 +2754,12 @@ static inline int add_chain_cache(struct task_struct *curr,
if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = nr_chain_hlocks;
for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class_idx - 1;
+ int lock_id = curr->held_locks[i].class_idx;
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
- }
-
- if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
nr_chain_hlocks += chain->depth;
-
-#ifdef CONFIG_DEBUG_LOCKDEP
- /*
- * Important for check_no_collision().
- */
- if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ } else {
if (!debug_locks_off_graph_unlock())
return 0;
@@ -2223,7 +2767,6 @@ static inline int add_chain_cache(struct task_struct *curr,
dump_stack();
return 0;
}
-#endif
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
@@ -2233,19 +2776,16 @@ static inline int add_chain_cache(struct task_struct *curr,
}
/*
- * Look up a dependency chain.
+ * Look up a dependency chain. Must be called with either the graph lock or
+ * the RCU read lock held.
*/
static inline struct lock_chain *lookup_chain_cache(u64 chain_key)
{
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- /*
- * We can walk it lock-free, because entries only get added
- * to the hash:
- */
hlist_for_each_entry_rcu(chain, hash_head, entry) {
- if (chain->chain_key == chain_key) {
+ if (READ_ONCE(chain->chain_key) == chain_key) {
debug_atomic_inc(chain_lookup_hits);
return chain;
}
@@ -2304,8 +2844,9 @@ cache_hit:
return 1;
}
-static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
- struct held_lock *hlock, int chain_head, u64 chain_key)
+static int validate_chain(struct task_struct *curr,
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
/*
* Trylock needs to maintain the stack of held locks, but it
@@ -2326,12 +2867,18 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* - is softirq-safe, if this lock is hardirq-unsafe
*
* And check whether the new lock's dependency graph
- * could lead back to the previous lock.
+ * could lead back to the previous lock:
+ *
+ * - within the current held-lock stack
+ * - across our accumulated lock dependency records
*
- * any of these scenarios could lead to a deadlock. If
- * All validations
+ * any of these scenarios could lead to a deadlock.
+ */
+ /*
+ * The simple case: does the current hold the same lock
+ * already?
*/
- int ret = check_deadlock(curr, hlock, lock, hlock->read);
+ int ret = check_deadlock(curr, hlock);
if (!ret)
return 0;
@@ -2362,12 +2909,12 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
}
#else
static inline int validate_chain(struct task_struct *curr,
- struct lockdep_map *lock, struct held_lock *hlock,
- int chain_head, u64 chain_key)
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
return 1;
}
-#endif
+#endif /* CONFIG_PROVE_LOCKING */
/*
* We are building curr_chain_key incrementally, so double-check
@@ -2378,7 +2925,7 @@ static void check_chain_key(struct task_struct *curr)
#ifdef CONFIG_DEBUG_LOCKDEP
struct held_lock *hlock, *prev_hlock = NULL;
unsigned int i;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
for (i = 0; i < curr->lockdep_depth; i++) {
hlock = curr->held_locks + i;
@@ -2394,15 +2941,17 @@ static void check_chain_key(struct task_struct *curr)
(unsigned long long)hlock->prev_chain_key);
return;
}
+
/*
- * Whoops ran out of static storage again?
+ * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is
+ * it registered lock class index?
*/
- if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use)))
return;
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
chain_key = iterate_chain_key(chain_key, hlock->class_idx);
prev_hlock = hlock;
}
@@ -2420,8 +2969,11 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
-static void
-print_usage_bug_scenario(struct held_lock *lock)
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit);
+
+static void print_usage_bug_scenario(struct held_lock *lock)
{
struct lock_class *class = hlock_class(lock);
@@ -2438,12 +2990,12 @@ print_usage_bug_scenario(struct held_lock *lock)
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("================================\n");
@@ -2463,7 +3015,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
print_lock(this);
pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+ print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);
print_irqtrace_events(curr);
pr_warn("\nother info that might help us debug this:\n");
@@ -2473,8 +3025,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
/*
@@ -2484,20 +3034,18 @@ static inline int
valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
- if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
- return print_usage_bug(curr, this, bad_bit, new_bit);
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
+ print_usage_bug(curr, this, bad_bit, new_bit);
+ return 0;
+ }
return 1;
}
-static int mark_lock(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit);
-
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* print irq inversion bug:
*/
-static int
+static void
print_irq_inversion_bug(struct task_struct *curr,
struct lock_list *root, struct lock_list *other,
struct held_lock *this, int forwards,
@@ -2508,7 +3056,7 @@ print_irq_inversion_bug(struct task_struct *curr,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("========================================================\n");
@@ -2549,13 +3097,11 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
if (!save_trace(&root->trace))
- return 0;
+ return;
print_shortest_lock_dependencies(other, root);
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
/*
@@ -2572,14 +3118,17 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
root.parent = NULL;
root.class = hlock_class(this);
- ret = find_usage_forwards(&root, bit, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
+ ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 1, irqclass);
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, irqclass);
+ return 0;
}
/*
@@ -2596,14 +3145,17 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
root.parent = NULL;
root.class = hlock_class(this);
- ret = find_usage_backwards(&root, bit, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
+ ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 0, irqclass);
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, irqclass);
+ return 0;
}
void print_irqtrace_events(struct task_struct *curr)
@@ -2651,7 +3203,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {
static inline int state_verbose(enum lock_usage_bit bit,
struct lock_class *class)
{
- return state_verbose_f[bit >> 2](class);
+ return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);
}
typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
@@ -2662,8 +3214,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
int excl_bit = exclusive_bit(new_bit);
- int read = new_bit & 1;
- int dir = new_bit & 2;
+ int read = new_bit & LOCK_USAGE_READ_MASK;
+ int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
* mark USED_IN has to look forwards -- to ensure no dependency
@@ -2686,20 +3238,20 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
* Validate that the lock dependencies don't have conflicting usage
* states.
*/
- if ((!read || !dir || STRICT_READ_CHECKS) &&
- !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
+ if ((!read || STRICT_READ_CHECKS) &&
+ !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
return 0;
/*
* Check for read in write conflicts
*/
if (!read) {
- if (!valid_state(curr, this, new_bit, excl_bit + 1))
+ if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK))
return 0;
if (STRICT_READ_CHECKS &&
- !usage(curr, this, excl_bit + 1,
- state_name(new_bit + 1)))
+ !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK,
+ state_name(new_bit + LOCK_USAGE_READ_MASK)))
return 0;
}
@@ -2709,35 +3261,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 1;
}
-enum mark_type {
-#define LOCKDEP_STATE(__STATE) __STATE,
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
-};
-
/*
* Mark all held locks with a usage bit:
*/
static int
-mark_held_locks(struct task_struct *curr, enum mark_type mark)
+mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit)
{
- enum lock_usage_bit usage_bit;
struct held_lock *hlock;
int i;
for (i = 0; i < curr->lockdep_depth; i++) {
+ enum lock_usage_bit hlock_bit = base_bit;
hlock = curr->held_locks + i;
- usage_bit = 2 + (mark << 2); /* ENABLED */
if (hlock->read)
- usage_bit += 1; /* READ */
+ hlock_bit += LOCK_USAGE_READ_MASK;
- BUG_ON(usage_bit >= LOCK_USAGE_STATES);
+ BUG_ON(hlock_bit >= LOCK_USAGE_STATES);
if (!hlock->check)
continue;
- if (!mark_lock(curr, hlock, usage_bit))
+ if (!mark_lock(curr, hlock, hlock_bit))
return 0;
}
@@ -2758,7 +3303,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
- if (!mark_held_locks(curr, HARDIRQ))
+ if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -2766,7 +3311,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, SOFTIRQ))
+ if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ))
return;
curr->hardirq_enable_ip = ip;
@@ -2800,7 +3345,7 @@ void lockdep_hardirqs_on(unsigned long ip)
/*
* See the fine text that goes along with this variable definition.
*/
- if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
+ if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))
return;
/*
@@ -2814,6 +3359,7 @@ void lockdep_hardirqs_on(unsigned long ip)
__trace_hardirqs_on_caller(ip);
current->lockdep_recursion = 0;
}
+NOKPROBE_SYMBOL(lockdep_hardirqs_on);
/*
* Hardirqs were disabled:
@@ -2843,6 +3389,7 @@ void lockdep_hardirqs_off(unsigned long ip)
} else
debug_atomic_inc(redundant_hardirqs_off);
}
+NOKPROBE_SYMBOL(lockdep_hardirqs_off);
/*
* Softirqs will be enabled:
@@ -2880,7 +3427,7 @@ void trace_softirqs_on(unsigned long ip)
* enabled too:
*/
if (curr->hardirqs_enabled)
- mark_held_locks(curr, SOFTIRQ);
+ mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
current->lockdep_recursion = 0;
}
@@ -2916,8 +3463,12 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
+static int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
{
+ if (!check)
+ goto lock_used;
+
/*
* If non-trylock use in a hardirq or softirq context, then
* mark the lock as used in these contexts:
@@ -2961,6 +3512,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
+lock_used:
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
+
return 1;
}
@@ -2992,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr,
return 0;
}
-#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
-static inline
-int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit)
-{
- WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
- return 1;
-}
-
-static inline int mark_irqflags(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 1;
-}
-
-static inline unsigned int task_irq_context(struct task_struct *task)
-{
- return 0;
-}
-
-static inline int separate_irq_context(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 0;
-}
-
-#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
/*
* Mark a lock with a usage bit, and validate the state transition:
*/
@@ -3029,6 +3556,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
{
unsigned int new_mask = 1 << new_bit, ret = 1;
+ if (new_bit >= LOCK_USAGE_STATES) {
+ DEBUG_LOCKS_WARN_ON(1);
+ return 0;
+ }
+
/*
* If already set then do not dirty the cacheline,
* nor do any checks:
@@ -3052,25 +3584,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return 0;
switch (new_bit) {
-#define LOCKDEP_STATE(__STATE) \
- case LOCK_USED_IN_##__STATE: \
- case LOCK_USED_IN_##__STATE##_READ: \
- case LOCK_ENABLED_##__STATE: \
- case LOCK_ENABLED_##__STATE##_READ:
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
- ret = mark_lock_irq(curr, this, new_bit);
- if (!ret)
- return 0;
- break;
case LOCK_USED:
debug_atomic_dec(nr_unused_locks);
break;
default:
- if (!debug_locks_off_graph_unlock())
+ ret = mark_lock_irq(curr, this, new_bit);
+ if (!ret)
return 0;
- WARN_ON(1);
- return 0;
}
graph_unlock();
@@ -3088,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
+#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
+static inline int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
+{
+ return 1;
+}
+
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
/*
* Initialize a lock instance's lock-class mapping info:
*/
@@ -3119,13 +3660,12 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
if (DEBUG_LOCKS_WARN_ON(!key))
return;
/*
- * Sanity check, the lock-class key must be persistent:
+ * Sanity check, the lock-class key must either have been allocated
+ * statically or must have been registered as a dynamic key.
*/
- if (!static_obj(key)) {
- printk("BUG: key %px not in .data!\n", key);
- /*
- * What it says above ^^^^^, I suggest you read it.
- */
+ if (!static_obj(key) && !is_dynamic_key(key)) {
+ if (debug_locks)
+ printk(KERN_ERR "BUG: key %px has not been registered!\n", key);
DEBUG_LOCKS_WARN_ON(1);
return;
}
@@ -3152,15 +3692,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
-static int
+static void
print_lock_nested_lock_not_held(struct task_struct *curr,
struct held_lock *hlock,
unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("==================================\n");
@@ -3182,8 +3722,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static int __lock_is_held(const struct lockdep_map *lock, int read);
@@ -3248,24 +3786,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
return 0;
- class_idx = class - lock_classes + 1;
+ class_idx = class - lock_classes;
if (depth) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
- if (hlock->references) {
- /*
- * Check: unsigned int references:12, overflow.
- */
- if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1))
- return 0;
+ if (!references)
+ references++;
+ if (!hlock->references)
hlock->references++;
- } else {
- hlock->references = 2;
- }
- return 1;
+ hlock->references += references;
+
+ /* Overflow */
+ if (DEBUG_LOCKS_WARN_ON(hlock->references < references))
+ return 0;
+
+ return 2;
}
}
@@ -3292,11 +3830,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
hlock->pin_count = pin_count;
- if (check && !mark_irqflags(curr, hlock))
- return 0;
-
- /* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED))
+ /* Initialize the lock usage bit */
+ if (!mark_usage(curr, hlock, check))
return 0;
/*
@@ -3310,9 +3845,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* the hash, not class->key.
*/
/*
- * Whoops, we did it again.. ran straight out of our static allocation.
+ * Whoops, we did it again.. class_idx is invalid.
*/
- if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use)))
return 0;
chain_key = curr->curr_chain_key;
@@ -3320,22 +3855,29 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
/*
* How can we have a chain hash when we ain't got no keys?!
*/
- if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY))
return 0;
chain_head = 1;
}
hlock->prev_chain_key = chain_key;
if (separate_irq_context(curr, hlock)) {
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
chain_head = 1;
}
chain_key = iterate_chain_key(chain_key, class_idx);
- if (nest_lock && !__lock_is_held(nest_lock, -1))
- return print_lock_nested_lock_not_held(curr, hlock, ip);
+ if (nest_lock && !__lock_is_held(nest_lock, -1)) {
+ print_lock_nested_lock_not_held(curr, hlock, ip);
+ return 0;
+ }
+
+ if (!debug_locks_silent) {
+ WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key);
+ WARN_ON_ONCE(!hlock_class(hlock)->key);
+ }
- if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+ if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
curr->curr_chain_key = chain_key;
@@ -3364,14 +3906,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
return 1;
}
-static int
-print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_unlock_imbalance_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("=====================================\n");
@@ -3389,8 +3931,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static int match_held_lock(const struct held_lock *hlock,
@@ -3422,7 +3962,7 @@ static int match_held_lock(const struct held_lock *hlock,
if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
return 0;
- if (hlock->class_idx == class - lock_classes + 1)
+ if (hlock->class_idx == class - lock_classes)
return 1;
}
@@ -3466,22 +4006,33 @@ out:
}
static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
- int idx)
+ int idx, unsigned int *merged)
{
struct held_lock *hlock;
+ int first_idx = idx;
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
- if (!__lock_acquire(hlock->instance,
+ switch (__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass,
hlock->trylock,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
+ hlock->references, hlock->pin_count)) {
+ case 0:
return 1;
+ case 1:
+ break;
+ case 2:
+ *merged += (idx == first_idx);
+ break;
+ default:
+ WARN_ON(1);
+ return 0;
+ }
}
return 0;
}
@@ -3492,11 +4043,14 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 0;
struct held_lock *hlock;
struct lock_class *class;
- unsigned int depth;
int i;
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* This function is about (re)setting the class of a held lock,
@@ -3506,24 +4060,26 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
lockdep_init_map(lock, name, key, 0);
class = register_lock_class(lock, subclass, 0);
- hlock->class_idx = class - lock_classes + 1;
+ hlock->class_idx = class - lock_classes;
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- if (reacquire_held_locks(curr, depth, i))
+ if (reacquire_held_locks(curr, depth, i, &merged))
return 0;
/*
* I took it apart and put it back together again, except now I have
* these 'spare' parts.. where shall I put them.
*/
- if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged))
return 0;
return 1;
}
@@ -3531,10 +4087,13 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 0;
struct held_lock *hlock;
- unsigned int depth;
int i;
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* This function is about (re)setting the class of a held lock,
@@ -3544,8 +4103,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
return 0;
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
@@ -3554,7 +4115,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
hlock->read = 1;
hlock->acquire_ip = ip;
- if (reacquire_held_locks(curr, depth, i))
+ if (reacquire_held_locks(curr, depth, i, &merged))
+ return 0;
+
+ /* Merging can't happen with unchanged classes.. */
+ if (DEBUG_LOCKS_WARN_ON(merged))
return 0;
/*
@@ -3563,6 +4128,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
*/
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
return 0;
+
return 1;
}
@@ -3574,11 +4140,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
* @nested is an hysterical artifact, needs a tree wide cleanup.
*/
static int
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+__lock_release(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 1;
struct held_lock *hlock;
- unsigned int depth;
int i;
if (unlikely(!debug_locks))
@@ -3589,16 +4155,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
* So we're all set to release this lock.. wait what lock? We don't
* own any locks, you've been drinking again?
*/
- if (DEBUG_LOCKS_WARN_ON(depth <= 0))
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (depth <= 0) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
/*
* Check whether the lock exists in the current stack
* of held locks:
*/
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
if (hlock->instance == lock)
lock_release_holdtime(hlock);
@@ -3633,14 +4203,15 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
if (i == depth-1)
return 1;
- if (reacquire_held_locks(curr, depth, i + 1))
+ if (reacquire_held_locks(curr, depth, i + 1, &merged))
return 0;
/*
* We had N bottles of beer on the wall, we drank one, but now
* there's not N-1 bottles of beer left on the wall...
+ * Pouring two of the bottles together is acceptable.
*/
- DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1);
+ DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged);
/*
* Since reacquire_held_locks() would have called check_chain_key()
@@ -3650,7 +4221,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
return 0;
}
-static int __lock_is_held(const struct lockdep_map *lock, int read)
+static nokprobe_inline
+int __lock_is_held(const struct lockdep_map *lock, int read)
{
struct task_struct *curr = current;
int i;
@@ -3857,7 +4429,7 @@ void lock_release(struct lockdep_map *lock, int nested,
check_flags(flags);
current->lockdep_recursion = 1;
trace_lock_release(lock, ip);
- if (__lock_release(lock, nested, ip))
+ if (__lock_release(lock, ip))
check_chain_key(current);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
@@ -3883,6 +4455,7 @@ int lock_is_held_type(const struct lockdep_map *lock, int read)
return ret;
}
EXPORT_SYMBOL_GPL(lock_is_held_type);
+NOKPROBE_SYMBOL(lock_is_held_type);
struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
{
@@ -3939,14 +4512,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
EXPORT_SYMBOL_GPL(lock_unpin_lock);
#ifdef CONFIG_LOCK_STAT
-static int
-print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_lock_contention_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("=================================\n");
@@ -3964,8 +4537,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static void
@@ -4110,9 +4681,7 @@ void lockdep_reset(void)
int i;
raw_local_irq_save(flags);
- current->curr_chain_key = 0;
- current->lockdep_depth = 0;
- current->lockdep_recursion = 0;
+ lockdep_init_task(current);
memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
nr_hardirq_chains = 0;
nr_softirq_chains = 0;
@@ -4123,29 +4692,132 @@ void lockdep_reset(void)
raw_local_irq_restore(flags);
}
+/* Remove a class from a lock chain. Must be called with the graph lock held. */
+static void remove_class_from_lock_chain(struct pending_free *pf,
+ struct lock_chain *chain,
+ struct lock_class *class)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_chain *new_chain;
+ u64 chain_key;
+ int i;
+
+ for (i = chain->base; i < chain->base + chain->depth; i++) {
+ if (chain_hlocks[i] != class - lock_classes)
+ continue;
+ /* The code below leaks one chain_hlock[] entry. */
+ if (--chain->depth > 0) {
+ memmove(&chain_hlocks[i], &chain_hlocks[i + 1],
+ (chain->base + chain->depth - i) *
+ sizeof(chain_hlocks[0]));
+ }
+ /*
+ * Each lock class occurs at most once in a lock chain so once
+ * we found a match we can break out of this loop.
+ */
+ goto recalc;
+ }
+ /* Since the chain has not been modified, return. */
+ return;
+
+recalc:
+ chain_key = INITIAL_CHAIN_KEY;
+ for (i = chain->base; i < chain->base + chain->depth; i++)
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
+ if (chain->depth && chain->chain_key == chain_key)
+ return;
+ /* Overwrite the chain key for concurrent RCU readers. */
+ WRITE_ONCE(chain->chain_key, chain_key);
+ /*
+ * Note: calling hlist_del_rcu() from inside a
+ * hlist_for_each_entry_rcu() loop is safe.
+ */
+ hlist_del_rcu(&chain->entry);
+ __set_bit(chain - lock_chains, pf->lock_chains_being_freed);
+ if (chain->depth == 0)
+ return;
+ /*
+ * If the modified lock chain matches an existing lock chain, drop
+ * the modified lock chain.
+ */
+ if (lookup_chain_cache(chain_key))
+ return;
+ new_chain = alloc_lock_chain();
+ if (WARN_ON_ONCE(!new_chain)) {
+ debug_locks_off();
+ return;
+ }
+ *new_chain = *chain;
+ hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key));
+#endif
+}
+
+/* Must be called with the graph lock held. */
+static void remove_class_from_lock_chains(struct pending_free *pf,
+ struct lock_class *class)
+{
+ struct lock_chain *chain;
+ struct hlist_head *head;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) {
+ head = chainhash_table + i;
+ hlist_for_each_entry_rcu(chain, head, entry) {
+ remove_class_from_lock_chain(pf, chain, class);
+ }
+ }
+}
+
/*
* Remove all references to a lock class. The caller must hold the graph lock.
*/
-static void zap_class(struct lock_class *class)
+static void zap_class(struct pending_free *pf, struct lock_class *class)
{
+ struct lock_list *entry;
int i;
+ WARN_ON_ONCE(!class->key);
+
/*
* Remove all dependencies this lock is
* involved in:
*/
- for (i = 0; i < nr_list_entries; i++) {
- if (list_entries[i].class == class)
- list_del_rcu(&list_entries[i].entry);
+ for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) {
+ entry = list_entries + i;
+ if (entry->class != class && entry->links_to != class)
+ continue;
+ __clear_bit(i, list_entries_in_use);
+ nr_list_entries--;
+ list_del_rcu(&entry->entry);
+ }
+ if (list_empty(&class->locks_after) &&
+ list_empty(&class->locks_before)) {
+ list_move_tail(&class->lock_entry, &pf->zapped);
+ hlist_del_rcu(&class->hash_entry);
+ WRITE_ONCE(class->key, NULL);
+ WRITE_ONCE(class->name, NULL);
+ nr_lock_classes--;
+ __clear_bit(class - lock_classes, lock_classes_in_use);
+ } else {
+ WARN_ONCE(true, "%s() failed for class %s\n", __func__,
+ class->name);
}
- /*
- * Unhash the class and remove it from the all_lock_classes list:
- */
- hlist_del_rcu(&class->hash_entry);
- list_del(&class->lock_entry);
- RCU_INIT_POINTER(class->key, NULL);
- RCU_INIT_POINTER(class->name, NULL);
+ remove_class_from_lock_chains(pf, class);
+}
+
+static void reinit_class(struct lock_class *class)
+{
+ void *const p = class;
+ const unsigned int offset = offsetof(struct lock_class, key);
+
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
+ memset(p + offset, 0, sizeof(*class) - offset);
+ WARN_ON_ONCE(!class->lock_entry.next);
+ WARN_ON_ONCE(!list_empty(&class->locks_after));
+ WARN_ON_ONCE(!list_empty(&class->locks_before));
}
static inline int within(const void *addr, void *start, unsigned long size)
@@ -4153,55 +4825,171 @@ static inline int within(const void *addr, void *start, unsigned long size)
return addr >= start && addr < start + size;
}
+static bool inside_selftest(void)
+{
+ return current == lockdep_selftest_task_struct;
+}
+
+/* The caller must hold the graph lock. */
+static struct pending_free *get_pending_free(void)
+{
+ return delayed_free.pf + delayed_free.index;
+}
+
+static void free_zapped_rcu(struct rcu_head *cb);
+
/*
- * Used in module.c to remove lock classes from memory that is going to be
- * freed; and possibly re-used by other modules.
- *
- * We will have had one sync_sched() before getting here, so we're guaranteed
- * nobody will look up these exact classes -- they're properly dead but still
- * allocated.
+ * Schedule an RCU callback if no RCU callback is pending. Must be called with
+ * the graph lock held.
*/
-void lockdep_free_key_range(void *start, unsigned long size)
+static void call_rcu_zapped(struct pending_free *pf)
+{
+ WARN_ON_ONCE(inside_selftest());
+
+ if (list_empty(&pf->zapped))
+ return;
+
+ if (delayed_free.scheduled)
+ return;
+
+ delayed_free.scheduled = true;
+
+ WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf);
+ delayed_free.index ^= 1;
+
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+}
+
+/* The caller must hold the graph lock. May be called from RCU context. */
+static void __free_zapped_classes(struct pending_free *pf)
{
struct lock_class *class;
- struct hlist_head *head;
+
+ check_data_structures();
+
+ list_for_each_entry(class, &pf->zapped, lock_entry)
+ reinit_class(class);
+
+ list_splice_init(&pf->zapped, &free_lock_classes);
+
+#ifdef CONFIG_PROVE_LOCKING
+ bitmap_andnot(lock_chains_in_use, lock_chains_in_use,
+ pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains));
+ bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains));
+#endif
+}
+
+static void free_zapped_rcu(struct rcu_head *ch)
+{
+ struct pending_free *pf;
unsigned long flags;
- int i;
- int locked;
+
+ if (WARN_ON_ONCE(ch != &delayed_free.rcu_head))
+ return;
raw_local_irq_save(flags);
- locked = graph_lock();
+ arch_spin_lock(&lockdep_lock);
+ current->lockdep_recursion = 1;
+
+ /* closed head */
+ pf = delayed_free.pf + (delayed_free.index ^ 1);
+ __free_zapped_classes(pf);
+ delayed_free.scheduled = false;
/*
- * Unhash all classes that were created by this module:
+ * If there's anything on the open list, close and start a new callback.
*/
+ call_rcu_zapped(delayed_free.pf + delayed_free.index);
+
+ current->lockdep_recursion = 0;
+ arch_spin_unlock(&lockdep_lock);
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Remove all lock classes from the class hash table and from the
+ * all_lock_classes list whose key or name is in the address range [start,
+ * start + size). Move these lock classes to the zapped_classes list. Must
+ * be called with the graph lock held.
+ */
+static void __lockdep_free_key_range(struct pending_free *pf, void *start,
+ unsigned long size)
+{
+ struct lock_class *class;
+ struct hlist_head *head;
+ int i;
+
+ /* Unhash all classes that were created by a module. */
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
hlist_for_each_entry_rcu(class, head, hash_entry) {
- if (within(class->key, start, size))
- zap_class(class);
- else if (within(class->name, start, size))
- zap_class(class);
+ if (!within(class->key, start, size) &&
+ !within(class->name, start, size))
+ continue;
+ zap_class(pf, class);
}
}
+}
- if (locked)
- graph_unlock();
+/*
+ * Used in module.c to remove lock classes from memory that is going to be
+ * freed; and possibly re-used by other modules.
+ *
+ * We will have had one synchronize_rcu() before getting here, so we're
+ * guaranteed nobody will look up these exact classes -- they're properly dead
+ * but still allocated.
+ */
+static void lockdep_free_key_range_reg(void *start, unsigned long size)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ current->lockdep_recursion = 1;
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, start, size);
+ call_rcu_zapped(pf);
+ current->lockdep_recursion = 0;
+ arch_spin_unlock(&lockdep_lock);
raw_local_irq_restore(flags);
/*
* Wait for any possible iterators from look_up_lock_class() to pass
* before continuing to free the memory they refer to.
- *
- * sync_sched() is sufficient because the read-side is IRQ disable.
*/
synchronize_rcu();
+}
- /*
- * XXX at this point we could return the resources to the pool;
- * instead we leak them. We would need to change to bitmap allocators
- * instead of the linear allocators we have now.
- */
+/*
+ * Free all lockdep keys in the range [start, start+size). Does not sleep.
+ * Ignores debug_locks. Must only be used by the lockdep selftests.
+ */
+static void lockdep_free_key_range_imm(void *start, unsigned long size)
+{
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
+
+ init_data_structures_once();
+
+ raw_local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ __lockdep_free_key_range(pf, start, size);
+ __free_zapped_classes(pf);
+ arch_spin_unlock(&lockdep_lock);
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ init_data_structures_once();
+
+ if (inside_selftest())
+ lockdep_free_key_range_imm(start, size);
+ else
+ lockdep_free_key_range_reg(start, size);
}
/*
@@ -4226,14 +5014,12 @@ static bool lock_class_cache_is_registered(struct lockdep_map *lock)
return false;
}
-void lockdep_reset_lock(struct lockdep_map *lock)
+/* The caller must hold the graph lock. Does not sleep. */
+static void __lockdep_reset_lock(struct pending_free *pf,
+ struct lockdep_map *lock)
{
struct lock_class *class;
- unsigned long flags;
- int j, locked;
-
- raw_local_irq_save(flags);
- locked = graph_lock();
+ int j;
/*
* Remove all classes this lock might have:
@@ -4244,27 +5030,104 @@ void lockdep_reset_lock(struct lockdep_map *lock)
*/
class = look_up_lock_class(lock, j);
if (class)
- zap_class(class);
+ zap_class(pf, class);
}
/*
* Debug check: in the end all mapped classes should
* be gone.
*/
- if (unlikely(lock_class_cache_is_registered(lock))) {
- if (debug_locks_off_graph_unlock()) {
- /*
- * We all just reset everything, how did it match?
- */
- WARN_ON(1);
+ if (WARN_ON_ONCE(lock_class_cache_is_registered(lock)))
+ debug_locks_off();
+}
+
+/*
+ * Remove all information lockdep has about a lock if debug_locks == 1. Free
+ * released data structures from RCU context.
+ */
+static void lockdep_reset_lock_reg(struct lockdep_map *lock)
+{
+ struct pending_free *pf;
+ unsigned long flags;
+ int locked;
+
+ raw_local_irq_save(flags);
+ locked = graph_lock();
+ if (!locked)
+ goto out_irq;
+
+ pf = get_pending_free();
+ __lockdep_reset_lock(pf, lock);
+ call_rcu_zapped(pf);
+
+ graph_unlock();
+out_irq:
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the
+ * lockdep selftests.
+ */
+static void lockdep_reset_lock_imm(struct lockdep_map *lock)
+{
+ struct pending_free *pf = delayed_free.pf;
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ arch_spin_lock(&lockdep_lock);
+ __lockdep_reset_lock(pf, lock);
+ __free_zapped_classes(pf);
+ arch_spin_unlock(&lockdep_lock);
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+ init_data_structures_once();
+
+ if (inside_selftest())
+ lockdep_reset_lock_imm(lock);
+ else
+ lockdep_reset_lock_reg(lock);
+}
+
+/* Unregister a dynamically allocated key. */
+void lockdep_unregister_key(struct lock_class_key *key)
+{
+ struct hlist_head *hash_head = keyhashentry(key);
+ struct lock_class_key *k;
+ struct pending_free *pf;
+ unsigned long flags;
+ bool found = false;
+
+ might_sleep();
+
+ if (WARN_ON_ONCE(static_obj(key)))
+ return;
+
+ raw_local_irq_save(flags);
+ if (!graph_lock())
+ goto out_irq;
+
+ pf = get_pending_free();
+ hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
+ if (k == key) {
+ hlist_del_rcu(&k->hash_entry);
+ found = true;
+ break;
}
- goto out_restore;
}
- if (locked)
- graph_unlock();
-
-out_restore:
+ WARN_ON_ONCE(!found);
+ __lockdep_free_key_range(pf, key, 1);
+ call_rcu_zapped(pf);
+ graph_unlock();
+out_irq:
raw_local_irq_restore(flags);
+
+ /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
+ synchronize_rcu();
}
+EXPORT_SYMBOL_GPL(lockdep_unregister_key);
void __init lockdep_init(void)
{
@@ -4278,20 +5141,25 @@ void __init lockdep_init(void)
printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
- printk(" memory used by lock dependency info: %lu kB\n",
- (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
- sizeof(struct list_head) * CLASSHASH_SIZE +
- sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
- sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
- sizeof(struct list_head) * CHAINHASH_SIZE
+ printk(" memory used by lock dependency info: %zu kB\n",
+ (sizeof(lock_classes) +
+ sizeof(lock_classes_in_use) +
+ sizeof(classhash_table) +
+ sizeof(list_entries) +
+ sizeof(list_entries_in_use) +
+ sizeof(chainhash_table) +
+ sizeof(delayed_free)
#ifdef CONFIG_PROVE_LOCKING
- + sizeof(struct circular_queue)
+ + sizeof(lock_cq)
+ + sizeof(lock_chains)
+ + sizeof(lock_chains_in_use)
+ + sizeof(chain_hlocks)
#endif
) / 1024
);
- printk(" per task-struct memory footprint: %lu bytes\n",
- sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+ printk(" per task-struct memory footprint: %zu bytes\n",
+ sizeof(((struct task_struct *)NULL)->held_locks));
}
static void
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 88c847a41c8a..cc83568d5012 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -22,6 +22,10 @@ enum lock_usage_bit {
LOCK_USAGE_STATES
};
+#define LOCK_USAGE_READ_MASK 1
+#define LOCK_USAGE_DIR_MASK 2
+#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))
+
/*
* Usage-state bitmasks:
*/
@@ -38,13 +42,35 @@ enum {
__LOCKF(USED)
};
-#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
-#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
+static const unsigned long LOCKF_ENABLED_IRQ =
+#include "lockdep_states.h"
+ 0;
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
+static const unsigned long LOCKF_USED_IN_IRQ =
+#include "lockdep_states.h"
+ 0;
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
+static const unsigned long LOCKF_ENABLED_IRQ_READ =
+#include "lockdep_states.h"
+ 0;
+#undef LOCKDEP_STATE
+
+#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
+static const unsigned long LOCKF_USED_IN_IRQ_READ =
+#include "lockdep_states.h"
+ 0;
+#undef LOCKDEP_STATE
+
+#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
+#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
-#define LOCKF_ENABLED_IRQ_READ \
- (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
-#define LOCKF_USED_IN_IRQ_READ \
- (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ)
+#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)
/*
* CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
@@ -96,7 +122,8 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
extern unsigned long nr_lock_classes;
extern unsigned long nr_list_entries;
-extern unsigned long nr_lock_chains;
+long lockdep_next_lockchain(long i);
+unsigned long lock_chain_count(void);
extern int nr_chain_hlocks;
extern unsigned long nr_stack_trace_entries;
@@ -104,7 +131,6 @@ extern unsigned int nr_hardirq_chains;
extern unsigned int nr_softirq_chains;
extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
-extern unsigned int max_recursion_depth;
extern unsigned int max_bfs_queue_depth;
@@ -133,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class)
* and we want to avoid too much cache bouncing.
*/
struct lockdep_stats {
- int chain_lookup_hits;
- int chain_lookup_misses;
- int hardirqs_on_events;
- int hardirqs_off_events;
- int redundant_hardirqs_on;
- int redundant_hardirqs_off;
- int softirqs_on_events;
- int softirqs_off_events;
- int redundant_softirqs_on;
- int redundant_softirqs_off;
- int nr_unused_locks;
- int nr_redundant_checks;
- int nr_redundant;
- int nr_cyclic_checks;
- int nr_cyclic_check_recursions;
- int nr_find_usage_forwards_checks;
- int nr_find_usage_forwards_recursions;
- int nr_find_usage_backwards_checks;
- int nr_find_usage_backwards_recursions;
+ unsigned long chain_lookup_hits;
+ unsigned int chain_lookup_misses;
+ unsigned long hardirqs_on_events;
+ unsigned long hardirqs_off_events;
+ unsigned long redundant_hardirqs_on;
+ unsigned long redundant_hardirqs_off;
+ unsigned long softirqs_on_events;
+ unsigned long softirqs_off_events;
+ unsigned long redundant_softirqs_on;
+ unsigned long redundant_softirqs_off;
+ int nr_unused_locks;
+ unsigned int nr_redundant_checks;
+ unsigned int nr_redundant;
+ unsigned int nr_cyclic_checks;
+ unsigned int nr_find_usage_forwards_checks;
+ unsigned int nr_find_usage_backwards_checks;
/*
* Per lock class locking operation stat counts
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 3d31f9b0059e..65b6a1600c8f 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -104,18 +104,18 @@ static const struct seq_operations lockdep_ops = {
#ifdef CONFIG_PROVE_LOCKING
static void *lc_start(struct seq_file *m, loff_t *pos)
{
+ if (*pos < 0)
+ return NULL;
+
if (*pos == 0)
return SEQ_START_TOKEN;
- if (*pos - 1 < nr_lock_chains)
- return lock_chains + (*pos - 1);
-
- return NULL;
+ return lock_chains + (*pos - 1);
}
static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
{
- (*pos)++;
+ *pos = lockdep_next_lockchain(*pos - 1) + 1;
return lc_start(m, pos);
}
@@ -210,6 +210,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
sum_forward_deps = 0;
+#ifdef CONFIG_PROVE_LOCKING
list_for_each_entry(class, &all_lock_classes, lock_entry) {
if (class->usage_mask == 0)
@@ -241,13 +242,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
nr_hardirq_read_unsafe++;
-#ifdef CONFIG_PROVE_LOCKING
sum_forward_deps += lockdep_count_forward_deps(class);
-#endif
}
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
+
+#endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS);
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
@@ -268,7 +269,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
- nr_lock_chains, MAX_LOCKDEP_CHAINS);
+ lock_chain_count(), MAX_LOCKDEP_CHAINS);
seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
#endif
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 7d0b0ed74404..c513031cd7e3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Module-based torture test facility for locking
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2014
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Davidlohr Bueso <dave@stgolabs.net>
* Based on kernel/rcu/torture.c.
*/
@@ -45,7 +32,7 @@
#include <linux/torture.h>
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
torture_param(int, nwriters_stress, -1,
"Number of write-locking stress-test threads");
@@ -842,7 +829,9 @@ static void lock_torture_cleanup(void)
"End of test: SUCCESS");
kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
kfree(cxt.lrsa);
+ cxt.lrsa = NULL;
end:
torture_cleanup_end();
@@ -970,7 +959,7 @@ static int __init lock_torture_init(void)
/* Prepare torture context. */
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
- onoff_interval * HZ);
+ onoff_interval * HZ, NULL);
if (firsterr)
goto unwind;
}
@@ -986,7 +975,7 @@ static int __init lock_torture_init(void)
goto unwind;
}
if (stutter > 0) {
- firsterr = torture_stutter_init(stutter);
+ firsterr = torture_stutter_init(stutter, stutter);
if (firsterr)
goto unwind;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index db578783dd36..0c601ae072b3 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/locking/mutex.c
*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 883cf1b92d90..364d38a0c444 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
@@ -7,6 +8,8 @@
#include <linux/sched.h>
#include <linux/errno.h>
+#include "rwsem.h"
+
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *rwsem_key)
{
@@ -15,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
return -ENOMEM;
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ rcu_sync_init(&sem->rss);
__init_rwsem(&sem->rw_sem, name, rwsem_key);
rcuwait_init(&sem->writer);
sem->readers_block = 0;
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index c7471c3fb798..fe9ca92faa2a 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Queued read/write locks
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
*
* Authors: Waiman Long <waiman.long@hp.com>
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 8a8c3c208c5e..2473f10c6956 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Queued spinlock
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014,2018 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
@@ -124,9 +115,6 @@ static inline __pure u32 encode_tail(int cpu, int idx)
{
u32 tail;
-#ifdef CONFIG_DEBUG_SPINLOCK
- BUG_ON(idx > 3);
-#endif
tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
@@ -398,7 +386,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* 0,1,0 -> 0,0,1
*/
clear_pending_set_locked(lock);
- qstat_inc(qstat_lock_pending, true);
+ lockevent_inc(lock_pending);
return;
/*
@@ -406,18 +394,34 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* queuing.
*/
queue:
- qstat_inc(qstat_lock_slowpath, true);
+ lockevent_inc(lock_slowpath);
pv_queue:
node = this_cpu_ptr(&qnodes[0].mcs);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
+ /*
+ * 4 nodes are allocated based on the assumption that there will
+ * not be nested NMIs taking spinlocks. That may not be true in
+ * some architectures even though the chance of needing more than
+ * 4 nodes will still be extremely unlikely. When that happens,
+ * we fall back to spinning on the lock directly without using
+ * any MCS node. This is not the most elegant solution, but is
+ * simple enough.
+ */
+ if (unlikely(idx >= MAX_NODES)) {
+ lockevent_inc(lock_no_node);
+ while (!queued_spin_trylock(lock))
+ cpu_relax();
+ goto release;
+ }
+
node = grab_mcs_node(node, idx);
/*
* Keep counts of non-zero index values:
*/
- qstat_inc(qstat_lock_idx1 + idx - 1, idx);
+ lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
/*
* Ensure that we increment the head node->count before initialising
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 8f36c27c1794..89bab079e7a4 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
if (!(val & _Q_LOCKED_PENDING_MASK) &&
(cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
- qstat_inc(qstat_pv_lock_stealing, true);
+ lockevent_inc(pv_lock_stealing);
return true;
}
if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
@@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
hopcnt++;
if (!cmpxchg(&he->lock, NULL, lock)) {
WRITE_ONCE(he->node, node);
- qstat_hop(hopcnt);
+ lockevent_pv_hop(hopcnt);
return &he->lock;
}
}
@@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
smp_store_mb(pn->state, vcpu_halted);
if (!READ_ONCE(node->locked)) {
- qstat_inc(qstat_pv_wait_node, true);
- qstat_inc(qstat_pv_wait_early, wait_early);
+ lockevent_inc(pv_wait_node);
+ lockevent_cond_inc(pv_wait_early, wait_early);
pv_wait(&pn->state, vcpu_halted);
}
@@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
* So it is better to spin for a while in the hope that the
* MCS lock will be released soon.
*/
- qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
+ lockevent_cond_inc(pv_spurious_wakeup,
+ !READ_ONCE(node->locked));
}
/*
@@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
/*
* Tracking # of slowpath locking operations
*/
- qstat_inc(qstat_lock_slowpath, true);
+ lockevent_inc(lock_slowpath);
for (;; waitcnt++) {
/*
@@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
}
}
WRITE_ONCE(pn->state, vcpu_hashed);
- qstat_inc(qstat_pv_wait_head, true);
- qstat_inc(qstat_pv_wait_again, waitcnt);
+ lockevent_inc(pv_wait_head);
+ lockevent_cond_inc(pv_wait_again, waitcnt);
pv_wait(&lock->locked, _Q_SLOW_VAL);
/*
@@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
* vCPU is harmless other than the additional latency in completing
* the unlock.
*/
- qstat_inc(qstat_pv_kick_unlock, true);
+ lockevent_inc(pv_kick_unlock);
pv_kick(node->cpu);
}
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 42d3d8dc8f49..e625bb410aa2 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -1,261 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * Authors: Waiman Long <waiman.long@hpe.com>
+ * Authors: Waiman Long <longman@redhat.com>
*/
-/*
- * When queued spinlock statistical counters are enabled, the following
- * debugfs files will be created for reporting the counter values:
- *
- * <debugfs>/qlockstat/
- * pv_hash_hops - average # of hops per hashing operation
- * pv_kick_unlock - # of vCPU kicks issued at unlock time
- * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
- * pv_latency_kick - average latency (ns) of vCPU kick operation
- * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
- * pv_lock_stealing - # of lock stealing operations
- * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
- * pv_wait_again - # of wait's after a queue head vCPU kick
- * pv_wait_early - # of early vCPU wait's
- * pv_wait_head - # of vCPU wait's at the queue head
- * pv_wait_node - # of vCPU wait's at a non-head queue node
- * lock_pending - # of locking operations via pending code
- * lock_slowpath - # of locking operations via MCS lock queue
- *
- * Writing to the "reset_counters" file will reset all the above counter
- * values.
- *
- * These statistical counters are implemented as per-cpu variables which are
- * summed and computed whenever the corresponding debugfs files are read. This
- * minimizes added overhead making the counters usable even in a production
- * environment.
- *
- * There may be slight difference between pv_kick_wake and pv_kick_unlock.
- */
-enum qlock_stats {
- qstat_pv_hash_hops,
- qstat_pv_kick_unlock,
- qstat_pv_kick_wake,
- qstat_pv_latency_kick,
- qstat_pv_latency_wake,
- qstat_pv_lock_stealing,
- qstat_pv_spurious_wakeup,
- qstat_pv_wait_again,
- qstat_pv_wait_early,
- qstat_pv_wait_head,
- qstat_pv_wait_node,
- qstat_lock_pending,
- qstat_lock_slowpath,
- qstat_lock_idx1,
- qstat_lock_idx2,
- qstat_lock_idx3,
- qstat_num, /* Total number of statistical counters */
- qstat_reset_cnts = qstat_num,
-};
+#include "lock_events.h"
-#ifdef CONFIG_QUEUED_LOCK_STAT
+#ifdef CONFIG_LOCK_EVENT_COUNTS
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
/*
- * Collect pvqspinlock statistics
+ * Collect pvqspinlock locking event counts
*/
-#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/fs.h>
-static const char * const qstat_names[qstat_num + 1] = {
- [qstat_pv_hash_hops] = "pv_hash_hops",
- [qstat_pv_kick_unlock] = "pv_kick_unlock",
- [qstat_pv_kick_wake] = "pv_kick_wake",
- [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
- [qstat_pv_latency_kick] = "pv_latency_kick",
- [qstat_pv_latency_wake] = "pv_latency_wake",
- [qstat_pv_lock_stealing] = "pv_lock_stealing",
- [qstat_pv_wait_again] = "pv_wait_again",
- [qstat_pv_wait_early] = "pv_wait_early",
- [qstat_pv_wait_head] = "pv_wait_head",
- [qstat_pv_wait_node] = "pv_wait_node",
- [qstat_lock_pending] = "lock_pending",
- [qstat_lock_slowpath] = "lock_slowpath",
- [qstat_lock_idx1] = "lock_index1",
- [qstat_lock_idx2] = "lock_index2",
- [qstat_lock_idx3] = "lock_index3",
- [qstat_reset_cnts] = "reset_counters",
-};
+#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev]
/*
- * Per-cpu counters
+ * PV specific per-cpu counter
*/
-static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
static DEFINE_PER_CPU(u64, pv_kick_time);
/*
- * Function to read and return the qlock statistical counter values
+ * Function to read and return the PV qspinlock counts.
*
* The following counters are handled specially:
- * 1. qstat_pv_latency_kick
+ * 1. pv_latency_kick
* Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
- * 2. qstat_pv_latency_wake
+ * 2. pv_latency_wake
* Average wake latency (ns) = pv_latency_wake/pv_kick_wake
- * 3. qstat_pv_hash_hops
+ * 3. pv_hash_hops
* Average hops/hash = pv_hash_hops/pv_kick_unlock
*/
-static ssize_t qstat_read(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t lockevent_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[64];
- int cpu, counter, len;
- u64 stat = 0, kicks = 0;
+ int cpu, id, len;
+ u64 sum = 0, kicks = 0;
/*
* Get the counter ID stored in file->f_inode->i_private
*/
- counter = (long)file_inode(file)->i_private;
+ id = (long)file_inode(file)->i_private;
- if (counter >= qstat_num)
+ if (id >= lockevent_num)
return -EBADF;
for_each_possible_cpu(cpu) {
- stat += per_cpu(qstats[counter], cpu);
+ sum += per_cpu(lockevents[id], cpu);
/*
- * Need to sum additional counter for some of them
+ * Need to sum additional counters for some of them
*/
- switch (counter) {
+ switch (id) {
- case qstat_pv_latency_kick:
- case qstat_pv_hash_hops:
- kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+ case LOCKEVENT_pv_latency_kick:
+ case LOCKEVENT_pv_hash_hops:
+ kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);
break;
- case qstat_pv_latency_wake:
- kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+ case LOCKEVENT_pv_latency_wake:
+ kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);
break;
}
}
- if (counter == qstat_pv_hash_hops) {
+ if (id == LOCKEVENT_pv_hash_hops) {
u64 frac = 0;
if (kicks) {
- frac = 100ULL * do_div(stat, kicks);
+ frac = 100ULL * do_div(sum, kicks);
frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
}
/*
* Return a X.XX decimal number
*/
- len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+ len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n",
+ sum, frac);
} else {
/*
* Round to the nearest ns
*/
- if ((counter == qstat_pv_latency_kick) ||
- (counter == qstat_pv_latency_wake)) {
+ if ((id == LOCKEVENT_pv_latency_kick) ||
+ (id == LOCKEVENT_pv_latency_wake)) {
if (kicks)
- stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+ sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);
}
- len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+ len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
}
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
/*
- * Function to handle write request
- *
- * When counter = reset_cnts, reset all the counter values.
- * Since the counter updates aren't atomic, the resetting is done twice
- * to make sure that the counters are very likely to be all cleared.
- */
-static ssize_t qstat_write(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- int cpu;
-
- /*
- * Get the counter ID stored in file->f_inode->i_private
- */
- if ((long)file_inode(file)->i_private != qstat_reset_cnts)
- return count;
-
- for_each_possible_cpu(cpu) {
- int i;
- unsigned long *ptr = per_cpu_ptr(qstats, cpu);
-
- for (i = 0 ; i < qstat_num; i++)
- WRITE_ONCE(ptr[i], 0);
- }
- return count;
-}
-
-/*
- * Debugfs data structures
- */
-static const struct file_operations fops_qstat = {
- .read = qstat_read,
- .write = qstat_write,
- .llseek = default_llseek,
-};
-
-/*
- * Initialize debugfs for the qspinlock statistical counters
- */
-static int __init init_qspinlock_stat(void)
-{
- struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
- int i;
-
- if (!d_qstat)
- goto out;
-
- /*
- * Create the debugfs files
- *
- * As reading from and writing to the stat files can be slow, only
- * root is allowed to do the read/write to limit impact to system
- * performance.
- */
- for (i = 0; i < qstat_num; i++)
- if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
- (void *)(long)i, &fops_qstat))
- goto fail_undo;
-
- if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
- (void *)(long)qstat_reset_cnts, &fops_qstat))
- goto fail_undo;
-
- return 0;
-fail_undo:
- debugfs_remove_recursive(d_qstat);
-out:
- pr_warn("Could not create 'qlockstat' debugfs entries\n");
- return -ENOMEM;
-}
-fs_initcall(init_qspinlock_stat);
-
-/*
- * Increment the PV qspinlock statistical counters
- */
-static inline void qstat_inc(enum qlock_stats stat, bool cond)
-{
- if (cond)
- this_cpu_inc(qstats[stat]);
-}
-
-/*
* PV hash hop count
*/
-static inline void qstat_hop(int hopcnt)
+static inline void lockevent_pv_hop(int hopcnt)
{
- this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+ this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);
}
/*
@@ -267,7 +111,7 @@ static inline void __pv_kick(int cpu)
per_cpu(pv_kick_time, cpu) = start;
pv_kick(cpu);
- this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+ this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);
}
/*
@@ -280,18 +124,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
*pkick_time = 0;
pv_wait(ptr, val);
if (*pkick_time) {
- this_cpu_add(qstats[qstat_pv_latency_wake],
+ this_cpu_add(EVENT_COUNT(pv_latency_wake),
sched_clock() - *pkick_time);
- qstat_inc(qstat_pv_kick_wake, true);
+ lockevent_inc(pv_kick_wake);
}
}
#define pv_kick(c) __pv_kick(c)
#define pv_wait(p, v) __pv_wait(p, v)
-#else /* CONFIG_QUEUED_LOCK_STAT */
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+#else /* CONFIG_LOCK_EVENT_COUNTS */
-static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
-static inline void qstat_hop(int hopcnt) { }
+static inline void lockevent_pv_hop(int hopcnt) { }
-#endif /* CONFIG_QUEUED_LOCK_STAT */
+#endif /* CONFIG_LOCK_EVENT_COUNTS */
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 978d63a8261c..38fbf9fa7f1b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* RT-Mutexes: simple blocking mutual exclusion locks with PI support
*
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
deleted file mode 100644
index a7ffb2a96ede..000000000000
--- a/kernel/locking/rwsem-spinlock.c
+++ /dev/null
@@ -1,339 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
- * generic spinlock implementation
- *
- * Copyright (c) 2001 David Howells (dhowells@redhat.com).
- * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
- * - Derived also from comments by Linus
- */
-#include <linux/rwsem.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/export.h>
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-int rwsem_is_locked(struct rw_semaphore *sem)
-{
- int ret = 1;
- unsigned long flags;
-
- if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
- ret = (sem->count != 0);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- }
- return ret;
-}
-EXPORT_SYMBOL(rwsem_is_locked);
-
-/*
- * initialise the semaphore
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- sem->count = 0;
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-}
-EXPORT_SYMBOL(__init_rwsem);
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here, then:
- * - the 'active count' _reached_ zero
- * - the 'waiting count' is non-zero
- * - the spinlock must be held by the caller
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if wakewrite is non-zero
- */
-static inline struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
-{
- struct rwsem_waiter *waiter;
- struct task_struct *tsk;
- int woken;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wakewrite)
- /* Wake up a writer. Note that we do not grant it the
- * lock - it will have to acquire it when it runs. */
- wake_up_process(waiter->task);
- goto out;
- }
-
- /* grant an infinite number of read locks to the front of the queue */
- woken = 0;
- do {
- struct list_head *next = waiter->list.next;
-
- list_del(&waiter->list);
- tsk = waiter->task;
- /*
- * Make sure we do not wakeup the next reader before
- * setting the nil condition to grant the next reader;
- * otherwise we could miss the wakeup on the other
- * side and end up sleeping again. See the pairing
- * in rwsem_down_read_failed().
- */
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
- woken++;
- if (next == &sem->wait_list)
- break;
- waiter = list_entry(next, struct rwsem_waiter, list);
- } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
-
- sem->count += woken;
-
- out:
- return sem;
-}
-
-/*
- * wake a single writer
- */
-static inline struct rw_semaphore *
-__rwsem_wake_one_writer(struct rw_semaphore *sem)
-{
- struct rwsem_waiter *waiter;
-
- waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
- wake_up_process(waiter->task);
-
- return sem;
-}
-
-/*
- * get a read lock on the semaphore
- */
-int __sched __down_read_common(struct rw_semaphore *sem, int state)
-{
- struct rwsem_waiter waiter;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->count++;
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- goto out;
- }
-
- /* set up my own style of waitqueue */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(current);
-
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* wait to be given the lock */
- for (;;) {
- if (!waiter.task)
- break;
- if (signal_pending_state(state, current))
- goto out_nolock;
- set_current_state(state);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- schedule();
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- out:
- return 0;
-
-out_nolock:
- /*
- * We didn't take the lock, so that there is a writer, which
- * is owner or the first waiter of the sem. If it's a waiter,
- * it will be woken by current owner. Not need to wake anybody.
- */
- list_del(&waiter.list);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- return -EINTR;
-}
-
-void __sched __down_read(struct rw_semaphore *sem)
-{
- __down_read_common(sem, TASK_UNINTERRUPTIBLE);
-}
-
-int __sched __down_read_killable(struct rw_semaphore *sem)
-{
- return __down_read_common(sem, TASK_KILLABLE);
-}
-
-/*
- * trylock for reading -- returns 1 if successful, 0 if contention
- */
-int __down_read_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count >= 0 && list_empty(&sem->wait_list)) {
- /* granted */
- sem->count++;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * get a write lock on the semaphore
- */
-int __sched __down_write_common(struct rw_semaphore *sem, int state)
-{
- struct rwsem_waiter waiter;
- unsigned long flags;
- int ret = 0;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- /* set up my own style of waitqueue */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* wait for someone to release the lock */
- for (;;) {
- /*
- * That is the key to support write lock stealing: allows the
- * task already on CPU to get the lock soon rather than put
- * itself into sleep and waiting for system woke it or someone
- * else in the head of the wait list up.
- */
- if (sem->count == 0)
- break;
- if (signal_pending_state(state, current))
- goto out_nolock;
-
- set_current_state(state);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- schedule();
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
- }
- /* got the lock */
- sem->count = -1;
- list_del(&waiter.list);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-
-out_nolock:
- list_del(&waiter.list);
- if (!list_empty(&sem->wait_list) && sem->count >= 0)
- __rwsem_do_wake(sem, 0);
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return -EINTR;
-}
-
-void __sched __down_write(struct rw_semaphore *sem)
-{
- __down_write_common(sem, TASK_UNINTERRUPTIBLE);
-}
-
-int __sched __down_write_killable(struct rw_semaphore *sem)
-{
- return __down_write_common(sem, TASK_KILLABLE);
-}
-
-/*
- * trylock for writing -- returns 1 if successful, 0 if contention
- */
-int __down_write_trylock(struct rw_semaphore *sem)
-{
- unsigned long flags;
- int ret = 0;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (sem->count == 0) {
- /* got the lock */
- sem->count = -1;
- ret = 1;
- }
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-
- return ret;
-}
-
-/*
- * release a read lock on the semaphore
- */
-void __up_read(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (--sem->count == 0 && !list_empty(&sem->wait_list))
- sem = __rwsem_wake_one_writer(sem);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * release a write lock on the semaphore
- */
-void __up_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->count = 0;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 1);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
-/*
- * downgrade a write lock into a read lock
- * - just wake up any readers at the front of the queue
- */
-void __downgrade_write(struct rw_semaphore *sem)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- sem->count = 1;
- if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, 0);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
-}
-
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
deleted file mode 100644
index 50d9af615dc4..000000000000
--- a/kernel/locking/rwsem-xadd.c
+++ /dev/null
@@ -1,725 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* rwsem.c: R/W semaphores: contention handling functions
- *
- * Written by David Howells (dhowells@redhat.com).
- * Derived from arch/i386/kernel/semaphore.c
- *
- * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
- * and Michel Lespinasse <walken@google.com>
- *
- * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
- * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
- */
-#include <linux/rwsem.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/debug.h>
-#include <linux/osq_lock.h>
-
-#include "rwsem.h"
-
-/*
- * Guide to the rw_semaphore's count field for common values.
- * (32-bit case illustrated, similar for 64-bit)
- *
- * 0x0000000X (1) X readers active or attempting lock, no writer waiting
- * X = #active_readers + #readers attempting to lock
- * (X*ACTIVE_BIAS)
- *
- * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
- * attempting to read lock or write lock.
- *
- * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
- * X = #active readers + # readers attempting lock
- * (X*ACTIVE_BIAS + WAITING_BIAS)
- * (2) 1 writer attempting lock, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- * (3) 1 writer active, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- *
- * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
- * (WAITING_BIAS + ACTIVE_BIAS)
- * (2) 1 writer active or attempting lock, no waiters for lock
- * (ACTIVE_WRITE_BIAS)
- *
- * 0xffff0000 (1) There are writers or readers queued but none active
- * or in the process of attempting lock.
- * (WAITING_BIAS)
- * Note: writer can attempt to steal lock for this count by adding
- * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
- *
- * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
- * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
- *
- * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
- * the count becomes more than 0 for successful lock acquisition,
- * i.e. the case where there are only readers or nobody has lock.
- * (1st and 2nd case above).
- *
- * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
- * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
- * acquisition (i.e. nobody else has lock or attempts lock). If
- * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
- * are only waiters but none active (5th case above), and attempt to
- * steal the lock.
- *
- */
-
-/*
- * Initialize an rwsem:
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- sem->owner = NULL;
- osq_lock_init(&sem->osq);
-#endif
-}
-
-EXPORT_SYMBOL(__init_rwsem);
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-enum rwsem_wake_type {
- RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
- RWSEM_WAKE_READERS, /* Wake readers only */
- RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
-};
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here from up_xxxx(), then:
- * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
- * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
- * - there must be someone on the queue
- * - the wait_lock must be held by the caller
- * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
- * to actually wakeup the blocked task(s) and drop the reference count,
- * preferably when the wait_lock is released
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only marked woken if downgrading is false
- */
-static void __rwsem_mark_wake(struct rw_semaphore *sem,
- enum rwsem_wake_type wake_type,
- struct wake_q_head *wake_q)
-{
- struct rwsem_waiter *waiter, *tmp;
- long oldcount, woken = 0, adjustment = 0;
-
- /*
- * Take a peek at the queue head waiter such that we can determine
- * the wakeup(s) to perform.
- */
- waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY) {
- /*
- * Mark writer at the front of the queue for wakeup.
- * Until the task is actually later awoken later by
- * the caller, other writers are able to steal it.
- * Readers, on the other hand, will block as they
- * will notice the queued writer.
- */
- wake_q_add(wake_q, waiter->task);
- }
-
- return;
- }
-
- /*
- * Writers might steal the lock before we grant it to the next reader.
- * We prefer to do the first reader grant before counting readers
- * so we can bail out early if a writer stole the lock.
- */
- if (wake_type != RWSEM_WAKE_READ_OWNED) {
- adjustment = RWSEM_ACTIVE_READ_BIAS;
- try_reader_grant:
- oldcount = atomic_long_fetch_add(adjustment, &sem->count);
- if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
- /*
- * If the count is still less than RWSEM_WAITING_BIAS
- * after removing the adjustment, it is assumed that
- * a writer has stolen the lock. We have to undo our
- * reader grant.
- */
- if (atomic_long_add_return(-adjustment, &sem->count) <
- RWSEM_WAITING_BIAS)
- return;
-
- /* Last active locker left. Retry waking readers. */
- goto try_reader_grant;
- }
- /*
- * It is not really necessary to set it to reader-owned here,
- * but it gives the spinners an early indication that the
- * readers now have the lock.
- */
- __rwsem_set_reader_owned(sem, waiter->task);
- }
-
- /*
- * Grant an infinite number of read locks to the readers at the front
- * of the queue. We know that woken will be at least 1 as we accounted
- * for above. Note we increment the 'active part' of the count by the
- * number of readers before waking any processes up.
- */
- list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
- struct task_struct *tsk;
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE)
- break;
-
- woken++;
- tsk = waiter->task;
-
- get_task_struct(tsk);
- list_del(&waiter->list);
- /*
- * Ensure calling get_task_struct() before setting the reader
- * waiter to nil such that rwsem_down_read_failed() cannot
- * race with do_exit() by always holding a reference count
- * to the task to wakeup.
- */
- smp_store_release(&waiter->task, NULL);
- /*
- * Ensure issuing the wakeup (either by us or someone else)
- * after setting the reader waiter to nil.
- */
- wake_q_add(wake_q, tsk);
- /* wake_q_add() already take the task ref */
- put_task_struct(tsk);
- }
-
- adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
- if (list_empty(&sem->wait_list)) {
- /* hit end of list above */
- adjustment -= RWSEM_WAITING_BIAS;
- }
-
- if (adjustment)
- atomic_long_add(adjustment, &sem->count);
-}
-
-/*
- * Wait for the read lock to be granted
- */
-static inline struct rw_semaphore __sched *
-__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
-{
- long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
- struct rwsem_waiter waiter;
- DEFINE_WAKE_Q(wake_q);
-
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_READ;
-
- raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list)) {
- /*
- * In case the wait queue is empty and the lock isn't owned
- * by a writer, this reader can exit the slowpath and return
- * immediately as its RWSEM_ACTIVE_READ_BIAS has already
- * been set in the count.
- */
- if (atomic_long_read(&sem->count) >= 0) {
- raw_spin_unlock_irq(&sem->wait_lock);
- return sem;
- }
- adjustment += RWSEM_WAITING_BIAS;
- }
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- count = atomic_long_add_return(adjustment, &sem->count);
-
- /*
- * If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (count == RWSEM_WAITING_BIAS ||
- (count > RWSEM_WAITING_BIAS &&
- adjustment != -RWSEM_ACTIVE_READ_BIAS))
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
-
- /* wait to be given the lock */
- while (true) {
- set_current_state(state);
- if (!waiter.task)
- break;
- if (signal_pending_state(state, current)) {
- raw_spin_lock_irq(&sem->wait_lock);
- if (waiter.task)
- goto out_nolock;
- raw_spin_unlock_irq(&sem->wait_lock);
- break;
- }
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
- return sem;
-out_nolock:
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list))
- atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
- raw_spin_unlock_irq(&sem->wait_lock);
- __set_current_state(TASK_RUNNING);
- return ERR_PTR(-EINTR);
-}
-
-__visible struct rw_semaphore * __sched
-rwsem_down_read_failed(struct rw_semaphore *sem)
-{
- return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(rwsem_down_read_failed);
-
-__visible struct rw_semaphore * __sched
-rwsem_down_read_failed_killable(struct rw_semaphore *sem)
-{
- return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(rwsem_down_read_failed_killable);
-
-/*
- * This function must be called with the sem->wait_lock held to prevent
- * race conditions between checking the rwsem wait list and setting the
- * sem->count accordingly.
- */
-static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
-{
- /*
- * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
- */
- if (count != RWSEM_WAITING_BIAS)
- return false;
-
- /*
- * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
- * are other tasks on the wait list, we need to add on WAITING_BIAS.
- */
- count = list_is_singular(&sem->wait_list) ?
- RWSEM_ACTIVE_WRITE_BIAS :
- RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
-
- if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
- == RWSEM_WAITING_BIAS) {
- rwsem_set_owner(sem);
- return true;
- }
-
- return false;
-}
-
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-/*
- * Try to acquire write lock before the writer has been put on wait queue.
- */
-static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
-{
- long old, count = atomic_long_read(&sem->count);
-
- while (true) {
- if (!(count == 0 || count == RWSEM_WAITING_BIAS))
- return false;
-
- old = atomic_long_cmpxchg_acquire(&sem->count, count,
- count + RWSEM_ACTIVE_WRITE_BIAS);
- if (old == count) {
- rwsem_set_owner(sem);
- return true;
- }
-
- count = old;
- }
-}
-
-static inline bool owner_on_cpu(struct task_struct *owner)
-{
- /*
- * As lock holder preemption issue, we both skip spinning if
- * task is not on cpu or its cpu is preempted
- */
- return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-}
-
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
-{
- struct task_struct *owner;
- bool ret = true;
-
- BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
-
- if (need_resched())
- return false;
-
- rcu_read_lock();
- owner = READ_ONCE(sem->owner);
- if (owner) {
- ret = is_rwsem_owner_spinnable(owner) &&
- owner_on_cpu(owner);
- }
- rcu_read_unlock();
- return ret;
-}
-
-/*
- * Return true only if we can still spin on the owner field of the rwsem.
- */
-static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
-{
- struct task_struct *owner = READ_ONCE(sem->owner);
-
- if (!is_rwsem_owner_spinnable(owner))
- return false;
-
- rcu_read_lock();
- while (owner && (READ_ONCE(sem->owner) == owner)) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking sem->owner still matches owner, if that fails,
- * owner might point to free()d memory, if it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- /*
- * abort spinning when need_resched or owner is not running or
- * owner's cpu is preempted.
- */
- if (need_resched() || !owner_on_cpu(owner)) {
- rcu_read_unlock();
- return false;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-
- /*
- * If there is a new owner or the owner is not set, we continue
- * spinning.
- */
- return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
-}
-
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
-{
- bool taken = false;
-
- preempt_disable();
-
- /* sem->wait_lock should not be held when doing optimistic spinning */
- if (!rwsem_can_spin_on_owner(sem))
- goto done;
-
- if (!osq_lock(&sem->osq))
- goto done;
-
- /*
- * Optimistically spin on the owner field and attempt to acquire the
- * lock whenever the owner changes. Spinning will be stopped when:
- * 1) the owning writer isn't running; or
- * 2) readers own the lock as we can't determine if they are
- * actively running or not.
- */
- while (rwsem_spin_on_owner(sem)) {
- /*
- * Try to acquire the lock
- */
- if (rwsem_try_write_lock_unqueued(sem)) {
- taken = true;
- break;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!sem->owner && (need_resched() || rt_task(current)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax();
- }
- osq_unlock(&sem->osq);
-done:
- preempt_enable();
- return taken;
-}
-
-/*
- * Return true if the rwsem has active spinner
- */
-static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
-{
- return osq_is_locked(&sem->osq);
-}
-
-#else
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
-{
- return false;
-}
-
-static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
-{
- return false;
-}
-#endif
-
-/*
- * Wait until we successfully acquire the write lock
- */
-static inline struct rw_semaphore *
-__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
-{
- long count;
- bool waiting = true; /* any queued threads before us */
- struct rwsem_waiter waiter;
- struct rw_semaphore *ret = sem;
- DEFINE_WAKE_Q(wake_q);
-
- /* undo write bias from down_write operation, stop active locking */
- count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
-
- /* do optimistic spinning and steal lock if possible */
- if (rwsem_optimistic_spin(sem))
- return sem;
-
- /*
- * Optimistic spinning failed, proceed to the slowpath
- * and block until we can acquire the sem.
- */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
-
- raw_spin_lock_irq(&sem->wait_lock);
-
- /* account for this before adding a new element to the list */
- if (list_empty(&sem->wait_list))
- waiting = false;
-
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- if (waiting) {
- count = atomic_long_read(&sem->count);
-
- /*
- * If there were already threads queued before us and there are
- * no active writers, the lock must be read owned; so we try to
- * wake any read locks that were queued ahead of us.
- */
- if (count > RWSEM_WAITING_BIAS) {
- __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
- /*
- * The wakeup is normally called _after_ the wait_lock
- * is released, but given that we are proactively waking
- * readers we can deal with the wake_q overhead as it is
- * similar to releasing and taking the wait_lock again
- * for attempting rwsem_try_write_lock().
- */
- wake_up_q(&wake_q);
-
- /*
- * Reinitialize wake_q after use.
- */
- wake_q_init(&wake_q);
- }
-
- } else
- count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
-
- /* wait until we successfully acquire the lock */
- set_current_state(state);
- while (true) {
- if (rwsem_try_write_lock(count, sem))
- break;
- raw_spin_unlock_irq(&sem->wait_lock);
-
- /* Block until there are no active lockers. */
- do {
- if (signal_pending_state(state, current))
- goto out_nolock;
-
- schedule();
- set_current_state(state);
- } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
-
- raw_spin_lock_irq(&sem->wait_lock);
- }
- __set_current_state(TASK_RUNNING);
- list_del(&waiter.list);
- raw_spin_unlock_irq(&sem->wait_lock);
-
- return ret;
-
-out_nolock:
- __set_current_state(TASK_RUNNING);
- raw_spin_lock_irq(&sem->wait_lock);
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list))
- atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
- else
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
-
- return ERR_PTR(-EINTR);
-}
-
-__visible struct rw_semaphore * __sched
-rwsem_down_write_failed(struct rw_semaphore *sem)
-{
- return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(rwsem_down_write_failed);
-
-__visible struct rw_semaphore * __sched
-rwsem_down_write_failed_killable(struct rw_semaphore *sem)
-{
- return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(rwsem_down_write_failed_killable);
-
-/*
- * handle waking up a waiter on the semaphore
- * - up_read/up_write has decremented the active part of count if we come here
- */
-__visible
-struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
- DEFINE_WAKE_Q(wake_q);
-
- /*
- * __rwsem_down_write_failed_common(sem)
- * rwsem_optimistic_spin(sem)
- * osq_unlock(sem->osq)
- * ...
- * atomic_long_add_return(&sem->count)
- *
- * - VS -
- *
- * __up_write()
- * if (atomic_long_sub_return_release(&sem->count) < 0)
- * rwsem_wake(sem)
- * osq_is_locked(&sem->osq)
- *
- * And __up_write() must observe !osq_is_locked() when it observes the
- * atomic_long_add_return() in order to not miss a wakeup.
- *
- * This boils down to:
- *
- * [S.rel] X = 1 [RmW] r0 = (Y += 0)
- * MB RMB
- * [RmW] Y += 1 [L] r1 = X
- *
- * exists (r0=1 /\ r1=0)
- */
- smp_rmb();
-
- /*
- * If a spinner is present, it is not necessary to do the wakeup.
- * Try to do wakeup only if the trylock succeeds to minimize
- * spinlock contention which may introduce too much delay in the
- * unlock operation.
- *
- * spinning writer up_write/up_read caller
- * --------------- -----------------------
- * [S] osq_unlock() [L] osq
- * MB RMB
- * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
- *
- * Here, it is important to make sure that there won't be a missed
- * wakeup while the rwsem is free and the only spinning writer goes
- * to sleep without taking the rwsem. Even when the spinning writer
- * is just going to break out of the waiting loop, it will still do
- * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
- * rwsem_has_spinner() is true, it will guarantee at least one
- * trylock attempt on the rwsem later on.
- */
- if (rwsem_has_spinner(sem)) {
- /*
- * The smp_rmb() here is to make sure that the spinner
- * state is consulted before reading the wait_lock.
- */
- smp_rmb();
- if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
- return sem;
- goto locked;
- }
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-locked:
-
- if (!list_empty(&sem->wait_list))
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- wake_up_q(&wake_q);
-
- return sem;
-}
-EXPORT_SYMBOL(rwsem_wake);
-
-/*
- * downgrade a write lock into a read lock
- * - caller incremented waiting part of count and discovered it still negative
- * - just wake up any readers at the front of the queue
- */
-__visible
-struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
- DEFINE_WAKE_Q(wake_q);
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (!list_empty(&sem->wait_list))
- __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- wake_up_q(&wake_q);
-
- return sem;
-}
-EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e586f0d03ad3..37524a47f002 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -3,17 +3,1438 @@
*
* Written by David Howells (dhowells@redhat.com).
* Derived from asm-i386/semaphore.h
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
+ * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
+ *
+ * Rwsem count bit fields re-definition and rwsem rearchitecture by
+ * Waiman Long <longman@redhat.com> and
+ * Peter Zijlstra <peterz@infradead.org>.
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/clock.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include "rwsem.h"
+#include "lock_events.h"
+
+/*
+ * The least significant 3 bits of the owner value has the following
+ * meanings when set.
+ * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
+ * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock.
+ * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock.
+ *
+ * When the rwsem is either owned by an anonymous writer, or it is
+ * reader-owned, but a spinning writer has timed out, both nonspinnable
+ * bits will be set to disable optimistic spinning by readers and writers.
+ * In the later case, the last unlocking reader should then check the
+ * writer nonspinnable bit and clear it only to give writers preference
+ * to acquire the lock via optimistic spinning, but not readers. Similar
+ * action is also done in the reader slowpath.
+
+ * When a writer acquires a rwsem, it puts its task_struct pointer
+ * into the owner field. It is cleared after an unlock.
+ *
+ * When a reader acquires a rwsem, it will also puts its task_struct
+ * pointer into the owner field with the RWSEM_READER_OWNED bit set.
+ * On unlock, the owner field will largely be left untouched. So
+ * for a free or reader-owned rwsem, the owner value may contain
+ * information about the last reader that acquires the rwsem.
+ *
+ * That information may be helpful in debugging cases where the system
+ * seems to hang on a reader owned rwsem especially if only one reader
+ * is involved. Ideally we would like to track all the readers that own
+ * a rwsem, but the overhead is simply too big.
+ *
+ * Reader optimistic spinning is helpful when the reader critical section
+ * is short and there aren't that many readers around. It makes readers
+ * relatively more preferred than writers. When a writer times out spinning
+ * on a reader-owned lock and set the nospinnable bits, there are two main
+ * reasons for that.
+ *
+ * 1) The reader critical section is long, perhaps the task sleeps after
+ * acquiring the read lock.
+ * 2) There are just too many readers contending the lock causing it to
+ * take a while to service all of them.
+ *
+ * In the former case, long reader critical section will impede the progress
+ * of writers which is usually more important for system performance. In
+ * the later case, reader optimistic spinning tends to make the reader
+ * groups that contain readers that acquire the lock together smaller
+ * leading to more of them. That may hurt performance in some cases. In
+ * other words, the setting of nonspinnable bits indicates that reader
+ * optimistic spinning may not be helpful for those workloads that cause
+ * it.
+ *
+ * Therefore, any writers that had observed the setting of the writer
+ * nonspinnable bit for a given rwsem after they fail to acquire the lock
+ * via optimistic spinning will set the reader nonspinnable bit once they
+ * acquire the write lock. Similarly, readers that observe the setting
+ * of reader nonspinnable bit at slowpath entry will set the reader
+ * nonspinnable bits when they acquire the read lock via the wakeup path.
+ *
+ * Once the reader nonspinnable bit is on, it will only be reset when
+ * a writer is able to acquire the rwsem in the fast path or somehow a
+ * reader or writer in the slowpath doesn't observe the nonspinable bit.
+ *
+ * This is to discourage reader optmistic spinning on that particular
+ * rwsem and make writers more preferred. This adaptive disabling of reader
+ * optimistic spinning will alleviate the negative side effect of this
+ * feature.
+ */
+#define RWSEM_READER_OWNED (1UL << 0)
+#define RWSEM_RD_NONSPINNABLE (1UL << 1)
+#define RWSEM_WR_NONSPINNABLE (1UL << 2)
+#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
+#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
+
+#ifdef CONFIG_DEBUG_RWSEMS
+# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
+ if (!debug_locks_silent && \
+ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
+ #c, atomic_long_read(&(sem)->count), \
+ atomic_long_read(&(sem)->owner), (long)current, \
+ list_empty(&(sem)->wait_list) ? "" : "not ")) \
+ debug_locks_off(); \
+ } while (0)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c, sem)
+#endif
+
+/*
+ * On 64-bit architectures, the bit definitions of the count are:
+ *
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-62 - 55-bit reader count
+ * Bit 63 - read fail bit
+ *
+ * On 32-bit architectures, the bit definitions of the count are:
+ *
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-30 - 23-bit reader count
+ * Bit 31 - read fail bit
+ *
+ * It is not likely that the most significant bit (read fail bit) will ever
+ * be set. This guard bit is still checked anyway in the down_read() fastpath
+ * just in case we need to use up more of the reader bits for other purpose
+ * in the future.
+ *
+ * atomic_long_fetch_add() is used to obtain reader lock, whereas
+ * atomic_long_cmpxchg() will be used to obtain writer lock.
+ *
+ * There are three places where the lock handoff bit may be set or cleared.
+ * 1) rwsem_mark_wake() for readers.
+ * 2) rwsem_try_write_lock() for writers.
+ * 3) Error path of rwsem_down_write_slowpath().
+ *
+ * For all the above cases, wait_lock will be held. A writer must also
+ * be the first one in the wait_list to be eligible for setting the handoff
+ * bit. So concurrent setting/clearing of handoff bit is not possible.
+ */
+#define RWSEM_WRITER_LOCKED (1UL << 0)
+#define RWSEM_FLAG_WAITERS (1UL << 1)
+#define RWSEM_FLAG_HANDOFF (1UL << 2)
+#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
+
+#define RWSEM_READER_SHIFT 8
+#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
+#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1))
+#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
+#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
+#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
+ RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
+
+/*
+ * All writes to owner are protected by WRITE_ONCE() to make sure that
+ * store tearing can't happen as optimistic spinners may read and use
+ * the owner value concurrently without lock. Read from owner, however,
+ * may not need READ_ONCE() as long as the pointer value is only used
+ * for comparison and isn't being dereferenced.
+ */
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ atomic_long_set(&sem->owner, (long)current);
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ atomic_long_set(&sem->owner, 0);
+}
+
+/*
+ * Test the flags in the owner field.
+ */
+static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
+{
+ return atomic_long_read(&sem->owner) & flags;
+}
+
+/*
+ * The task_struct pointer of the last owning reader will be left in
+ * the owner field.
+ *
+ * Note that the owner value just indicates the task has owned the rwsem
+ * previously, it may not be the real owner or one of the real owners
+ * anymore when that field is examined, so take it with a grain of salt.
+ *
+ * The reader non-spinnable bit is preserved.
+ */
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+ unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
+ (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE);
+
+ atomic_long_set(&sem->owner, val);
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+ __rwsem_set_reader_owned(sem, current);
+}
+
+/*
+ * Return true if the rwsem is owned by a reader.
+ */
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+#ifdef CONFIG_DEBUG_RWSEMS
+ /*
+ * Check the count to see if it is write-locked.
+ */
+ long count = atomic_long_read(&sem->count);
+
+ if (count & RWSEM_WRITER_MASK)
+ return false;
+#endif
+ return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
+}
+
+#ifdef CONFIG_DEBUG_RWSEMS
+/*
+ * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
+ * is a task pointer in owner of a reader-owned rwsem, it will be the
+ * real owner or one of the real owners. The only exception is when the
+ * unlock is done by up_read_non_owner().
+ */
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+ unsigned long val = atomic_long_read(&sem->owner);
+
+ while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
+ if (atomic_long_try_cmpxchg(&sem->owner, &val,
+ val & RWSEM_OWNER_FLAGS_MASK))
+ return;
+ }
+}
+#else
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+}
+#endif
+
+/*
+ * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag
+ * remains set. Otherwise, the operation will be aborted.
+ */
+static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ do {
+ if (!(owner & RWSEM_READER_OWNED))
+ break;
+ if (owner & RWSEM_NONSPINNABLE)
+ break;
+ } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
+ owner | RWSEM_NONSPINNABLE));
+}
+
+static inline bool rwsem_read_trylock(struct rw_semaphore *sem)
+{
+ long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
+ if (WARN_ON_ONCE(cnt < 0))
+ rwsem_set_nonspinnable(sem);
+ return !(cnt & RWSEM_READ_FAILED_MASK);
+}
+
+/*
+ * Return just the real task structure pointer of the owner
+ */
+static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+{
+ return (struct task_struct *)
+ (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
+/*
+ * Return the real task structure pointer of the owner and the embedded
+ * flags in the owner. pflags must be non-NULL.
+ */
+static inline struct task_struct *
+rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ *pflags = owner & RWSEM_OWNER_FLAGS_MASK;
+ return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
+/*
+ * Guide to the rw_semaphore's count field.
+ *
+ * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
+ * by a writer.
+ *
+ * The lock is owned by readers when
+ * (1) the RWSEM_WRITER_LOCKED isn't set in count,
+ * (2) some of the reader bits are set in count, and
+ * (3) the owner field has RWSEM_READ_OWNED bit set.
+ *
+ * Having some reader bits set is not enough to guarantee a readers owned
+ * lock as the readers may be in the process of backing out from the count
+ * and a writer has just released the lock. So another writer may steal
+ * the lock immediately after that.
+ */
+
+/*
+ * Initialize an rwsem:
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+ atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+ atomic_long_set(&sem->owner, 0L);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ osq_lock_init(&sem->osq);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+ unsigned long timeout;
+ unsigned long last_rowner;
+};
+#define rwsem_first_waiter(sem) \
+ list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
+
+enum rwsem_wake_type {
+ RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
+ RWSEM_WAKE_READERS, /* Wake readers only */
+ RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
+};
+
+enum writer_wait_state {
+ WRITER_NOT_FIRST, /* Writer is not first in wait list */
+ WRITER_FIRST, /* Writer is first in wait list */
+ WRITER_HANDOFF /* Writer is first & handoff needed */
+};
+
+/*
+ * The typical HZ value is either 250 or 1000. So set the minimum waiting
+ * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
+ * queue before initiating the handoff protocol.
+ */
+#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250)
+
+/*
+ * Magic number to batch-wakeup waiting readers, even when writers are
+ * also present in the queue. This both limits the amount of work the
+ * waking thread must do and also prevents any potential counter overflow,
+ * however unlikely.
+ */
+#define MAX_READERS_WAKEUP 0x100
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
+ * have been set.
+ * - there must be someone on the queue
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only marked woken if downgrading is false
+ */
+static void rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type,
+ struct wake_q_head *wake_q)
+{
+ struct rwsem_waiter *waiter, *tmp;
+ long oldcount, woken = 0, adjustment = 0;
+ struct list_head wlist;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ /*
+ * Take a peek at the queue head waiter such that we can determine
+ * the wakeup(s) to perform.
+ */
+ waiter = rwsem_first_waiter(sem);
+
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+ * Until the task is actually later awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
+ */
+ wake_q_add(wake_q, waiter->task);
+ lockevent_inc(rwsem_wake_writer);
+ }
+
+ return;
+ }
+
+ /*
+ * No reader wakeup if there are too many of them already.
+ */
+ if (unlikely(atomic_long_read(&sem->count) < 0))
+ return;
+
+ /*
+ * Writers might steal the lock before we grant it to the next reader.
+ * We prefer to do the first reader grant before counting readers
+ * so we can bail out early if a writer stole the lock.
+ */
+ if (wake_type != RWSEM_WAKE_READ_OWNED) {
+ struct task_struct *owner;
+
+ adjustment = RWSEM_READER_BIAS;
+ oldcount = atomic_long_fetch_add(adjustment, &sem->count);
+ if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
+ /*
+ * When we've been waiting "too" long (for writers
+ * to give up the lock), request a HANDOFF to
+ * force the issue.
+ */
+ if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
+ time_after(jiffies, waiter->timeout)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+
+ atomic_long_add(-adjustment, &sem->count);
+ return;
+ }
+ /*
+ * Set it to reader-owned to give spinners an early
+ * indication that readers now have the lock.
+ * The reader nonspinnable bit seen at slowpath entry of
+ * the reader is copied over.
+ */
+ owner = waiter->task;
+ if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) {
+ owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE);
+ lockevent_inc(rwsem_opt_norspin);
+ }
+ __rwsem_set_reader_owned(sem, owner);
+ }
+
+ /*
+ * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
+ * queue. We know that the woken will be at least 1 as we accounted
+ * for above. Note we increment the 'active part' of the count by the
+ * number of readers before waking any processes up.
+ *
+ * This is an adaptation of the phase-fair R/W locks where at the
+ * reader phase (first waiter is a reader), all readers are eligible
+ * to acquire the lock at the same time irrespective of their order
+ * in the queue. The writers acquire the lock according to their
+ * order in the queue.
+ *
+ * We have to do wakeup in 2 passes to prevent the possibility that
+ * the reader count may be decremented before it is incremented. It
+ * is because the to-be-woken waiter may not have slept yet. So it
+ * may see waiter->task got cleared, finish its critical section and
+ * do an unlock before the reader count increment.
+ *
+ * 1) Collect the read-waiters in a separate list, count them and
+ * fully increment the reader count in rwsem.
+ * 2) For each waiters in the new list, clear waiter->task and
+ * put them into wake_q to be woken up later.
+ */
+ INIT_LIST_HEAD(&wlist);
+ list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE)
+ continue;
+
+ woken++;
+ list_move_tail(&waiter->list, &wlist);
+
+ /*
+ * Limit # of readers that can be woken up per wakeup call.
+ */
+ if (woken >= MAX_READERS_WAKEUP)
+ break;
+ }
+
+ adjustment = woken * RWSEM_READER_BIAS - adjustment;
+ lockevent_cond_inc(rwsem_wake_reader, woken);
+ if (list_empty(&sem->wait_list)) {
+ /* hit end of list above */
+ adjustment -= RWSEM_FLAG_WAITERS;
+ }
+
+ /*
+ * When we've woken a reader, we no longer need to force writers
+ * to give up the lock and we can clear HANDOFF.
+ */
+ if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
+ adjustment -= RWSEM_FLAG_HANDOFF;
+
+ if (adjustment)
+ atomic_long_add(adjustment, &sem->count);
+
+ /* 2nd pass */
+ list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+ struct task_struct *tsk;
+
+ tsk = waiter->task;
+ get_task_struct(tsk);
+
+ /*
+ * Ensure calling get_task_struct() before setting the reader
+ * waiter to nil such that rwsem_down_read_slowpath() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
+ */
+ smp_store_release(&waiter->task, NULL);
+ /*
+ * Ensure issuing the wakeup (either by us or someone else)
+ * after setting the reader waiter to nil.
+ */
+ wake_q_add_safe(wake_q, tsk);
+ }
+}
+
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ *
+ * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
+ * bit is set or the lock is acquired with handoff bit cleared.
+ */
+static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
+ enum writer_wait_state wstate)
+{
+ long count, new;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ count = atomic_long_read(&sem->count);
+ do {
+ bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
+
+ if (has_handoff && wstate == WRITER_NOT_FIRST)
+ return false;
+
+ new = count;
+
+ if (count & RWSEM_LOCK_MASK) {
+ if (has_handoff || (wstate != WRITER_HANDOFF))
+ return false;
+
+ new |= RWSEM_FLAG_HANDOFF;
+ } else {
+ new |= RWSEM_WRITER_LOCKED;
+ new &= ~RWSEM_FLAG_HANDOFF;
+
+ if (list_is_singular(&sem->wait_list))
+ new &= ~RWSEM_FLAG_WAITERS;
+ }
+ } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
+
+ /*
+ * We have either acquired the lock with handoff bit cleared or
+ * set the handoff bit.
+ */
+ if (new & RWSEM_FLAG_HANDOFF)
+ return false;
+
+ rwsem_set_owner(sem);
+ return true;
+}
+
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * Try to acquire read lock before the reader is put on wait queue.
+ * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff
+ * is ongoing.
+ */
+static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+
+ if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))
+ return false;
+
+ count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count);
+ if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_opt_rlock);
+ return true;
+ }
+
+ /* Back out the change */
+ atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
+ return false;
+}
+
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+
+ while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
+ count | RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ lockevent_inc(rwsem_opt_wlock);
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline bool owner_on_cpu(struct task_struct *owner)
+{
+ /*
+ * As lock holder preemption issue, we both skip spinning if
+ * task is not on cpu or its cpu is preempted
+ */
+ return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
+ unsigned long nonspinnable)
+{
+ struct task_struct *owner;
+ unsigned long flags;
+ bool ret = true;
+
+ BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
+
+ if (need_resched()) {
+ lockevent_inc(rwsem_opt_fail);
+ return false;
+ }
+
+ preempt_disable();
+ rcu_read_lock();
+ owner = rwsem_owner_flags(sem, &flags);
+ if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner)))
+ ret = false;
+ rcu_read_unlock();
+ preempt_enable();
+
+ lockevent_cond_inc(rwsem_opt_fail, !ret);
+ return ret;
+}
+
+/*
+ * The rwsem_spin_on_owner() function returns the folowing 4 values
+ * depending on the lock owner state.
+ * OWNER_NULL : owner is currently NULL
+ * OWNER_WRITER: when owner changes and is a writer
+ * OWNER_READER: when owner changes and the new owner may be a reader.
+ * OWNER_NONSPINNABLE:
+ * when optimistic spinning has to stop because either the
+ * owner stops running, is unknown, or its timeslice has
+ * been used up.
+ */
+enum owner_state {
+ OWNER_NULL = 1 << 0,
+ OWNER_WRITER = 1 << 1,
+ OWNER_READER = 1 << 2,
+ OWNER_NONSPINNABLE = 1 << 3,
+};
+#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
+
+static inline enum owner_state
+rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable)
+{
+ if (flags & nonspinnable)
+ return OWNER_NONSPINNABLE;
+
+ if (flags & RWSEM_READER_OWNED)
+ return OWNER_READER;
+
+ return owner ? OWNER_WRITER : OWNER_NULL;
+}
+
+static noinline enum owner_state
+rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
+{
+ struct task_struct *new, *owner;
+ unsigned long flags, new_flags;
+ enum owner_state state;
+
+ owner = rwsem_owner_flags(sem, &flags);
+ state = rwsem_owner_state(owner, flags, nonspinnable);
+ if (state != OWNER_WRITER)
+ return state;
+
+ rcu_read_lock();
+ for (;;) {
+ if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
+ state = OWNER_NONSPINNABLE;
+ break;
+ }
+
+ new = rwsem_owner_flags(sem, &new_flags);
+ if ((new != owner) || (new_flags != flags)) {
+ state = rwsem_owner_state(new, new_flags, nonspinnable);
+ break;
+ }
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (need_resched() || !owner_on_cpu(owner)) {
+ state = OWNER_NONSPINNABLE;
+ break;
+ }
+
+ cpu_relax();
+ }
+ rcu_read_unlock();
+
+ return state;
+}
+
+/*
+ * Calculate reader-owned rwsem spinning threshold for writer
+ *
+ * The more readers own the rwsem, the longer it will take for them to
+ * wind down and free the rwsem. So the empirical formula used to
+ * determine the actual spinning time limit here is:
+ *
+ * Spinning threshold = (10 + nr_readers/2)us
+ *
+ * The limit is capped to a maximum of 25us (30 readers). This is just
+ * a heuristic and is subjected to change in the future.
+ */
+static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+ int readers = count >> RWSEM_READER_SHIFT;
+ u64 delta;
+
+ if (readers > 30)
+ readers = 30;
+ delta = (20 + readers) * NSEC_PER_USEC / 2;
+
+ return sched_clock() + delta;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+{
+ bool taken = false;
+ int prev_owner_state = OWNER_NULL;
+ int loop = 0;
+ u64 rspin_threshold = 0;
+ unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE
+ : RWSEM_RD_NONSPINNABLE;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ /*
+ * Optimistically spin on the owner field and attempt to acquire the
+ * lock whenever the owner changes. Spinning will be stopped when:
+ * 1) the owning writer isn't running; or
+ * 2) readers own the lock and spinning time has exceeded limit.
+ */
+ for (;;) {
+ enum owner_state owner_state;
+
+ owner_state = rwsem_spin_on_owner(sem, nonspinnable);
+ if (!(owner_state & OWNER_SPINNABLE))
+ break;
+
+ /*
+ * Try to acquire the lock
+ */
+ taken = wlock ? rwsem_try_write_lock_unqueued(sem)
+ : rwsem_try_read_lock_unqueued(sem);
+
+ if (taken)
+ break;
+
+ /*
+ * Time-based reader-owned rwsem optimistic spinning
+ */
+ if (wlock && (owner_state == OWNER_READER)) {
+ /*
+ * Re-initialize rspin_threshold every time when
+ * the owner state changes from non-reader to reader.
+ * This allows a writer to steal the lock in between
+ * 2 reader phases and have the threshold reset at
+ * the beginning of the 2nd reader phase.
+ */
+ if (prev_owner_state != OWNER_READER) {
+ if (rwsem_test_oflags(sem, nonspinnable))
+ break;
+ rspin_threshold = rwsem_rspin_threshold(sem);
+ loop = 0;
+ }
+
+ /*
+ * Check time threshold once every 16 iterations to
+ * avoid calling sched_clock() too frequently so
+ * as to reduce the average latency between the times
+ * when the lock becomes free and when the spinner
+ * is ready to do a trylock.
+ */
+ else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
+ rwsem_set_nonspinnable(sem);
+ lockevent_inc(rwsem_opt_nospin);
+ break;
+ }
+ }
+
+ /*
+ * An RT task cannot do optimistic spinning if it cannot
+ * be sure the lock holder is running or live-lock may
+ * happen if the current task and the lock holder happen
+ * to run in the same CPU. However, aborting optimistic
+ * spinning while a NULL owner is detected may miss some
+ * opportunity where spinning can continue without causing
+ * problem.
+ *
+ * There are 2 possible cases where an RT task may be able
+ * to continue spinning.
+ *
+ * 1) The lock owner is in the process of releasing the
+ * lock, sem->owner is cleared but the lock has not
+ * been released yet.
+ * 2) The lock was free and owner cleared, but another
+ * task just comes in and acquire the lock before
+ * we try to get it. The new owner may be a spinnable
+ * writer.
+ *
+ * To take advantage of two scenarios listed agove, the RT
+ * task is made to retry one more time to see if it can
+ * acquire the lock or continue spinning on the new owning
+ * writer. Of course, if the time lag is long enough or the
+ * new owner is not a writer or spinnable, the RT task will
+ * quit spinning.
+ *
+ * If the owner is a writer, the need_resched() check is
+ * done inside rwsem_spin_on_owner(). If the owner is not
+ * a writer, need_resched() check needs to be done here.
+ */
+ if (owner_state != OWNER_WRITER) {
+ if (need_resched())
+ break;
+ if (rt_task(current) &&
+ (prev_owner_state != OWNER_WRITER))
+ break;
+ }
+ prev_owner_state = owner_state;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ lockevent_cond_inc(rwsem_opt_fail, !taken);
+ return taken;
+}
+
+/*
+ * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should
+ * only be called when the reader count reaches 0.
+ *
+ * This give writers better chance to acquire the rwsem first before
+ * readers when the rwsem was being held by readers for a relatively long
+ * period of time. Race can happen that an optimistic spinner may have
+ * just stolen the rwsem and set the owner, but just clearing the
+ * RWSEM_WR_NONSPINNABLE bit will do no harm anyway.
+ */
+static inline void clear_wr_nonspinnable(struct rw_semaphore *sem)
+{
+ if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE))
+ atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner);
+}
+
+/*
+ * This function is called when the reader fails to acquire the lock via
+ * optimistic spinning. In this case we will still attempt to do a trylock
+ * when comparing the rwsem state right now with the state when entering
+ * the slowpath indicates that the reader is still in a valid reader phase.
+ * This happens when the following conditions are true:
+ *
+ * 1) The lock is currently reader owned, and
+ * 2) The lock is previously not reader-owned or the last read owner changes.
+ *
+ * In the former case, we have transitioned from a writer phase to a
+ * reader-phase while spinning. In the latter case, it means the reader
+ * phase hasn't ended when we entered the optimistic spinning loop. In
+ * both cases, the reader is eligible to acquire the lock. This is the
+ * secondary path where a read lock is acquired optimistically.
+ *
+ * The reader non-spinnable bit wasn't set at time of entry or it will
+ * not be here at all.
+ */
+static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
+ unsigned long last_rowner)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ if (!(owner & RWSEM_READER_OWNED))
+ return false;
+
+ if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) &&
+ rwsem_try_read_lock_unqueued(sem)) {
+ lockevent_inc(rwsem_opt_rlock2);
+ lockevent_add(rwsem_opt_fail, -1);
+ return true;
+ }
+ return false;
+}
+#else
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
+ unsigned long nonspinnable)
+{
+ return false;
+}
+
+static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+{
+ return false;
+}
+
+static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { }
+
+static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
+ unsigned long last_rowner)
+{
+ return false;
+}
+#endif
+
+/*
+ * Wait for the read lock to be granted
+ */
+static struct rw_semaphore __sched *
+rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
+{
+ long count, adjustment = -RWSEM_READER_BIAS;
+ struct rwsem_waiter waiter;
+ DEFINE_WAKE_Q(wake_q);
+ bool wake = false;
+
+ /*
+ * Save the current read-owner of rwsem, if available, and the
+ * reader nonspinnable bit.
+ */
+ waiter.last_rowner = atomic_long_read(&sem->owner);
+ if (!(waiter.last_rowner & RWSEM_READER_OWNED))
+ waiter.last_rowner &= RWSEM_RD_NONSPINNABLE;
+
+ if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE))
+ goto queue;
+
+ /*
+ * Undo read bias from down_read() and do optimistic spinning.
+ */
+ atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
+ adjustment = 0;
+ if (rwsem_optimistic_spin(sem, false)) {
+ /*
+ * Wake up other readers in the wait list if the front
+ * waiter is a reader.
+ */
+ if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
+ &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ }
+ return sem;
+ } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) {
+ return sem;
+ }
+
+queue:
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+ waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (list_empty(&sem->wait_list)) {
+ /*
+ * In case the wait queue is empty and the lock isn't owned
+ * by a writer or has the handoff bit set, this reader can
+ * exit the slowpath and return immediately as its
+ * RWSEM_READER_BIAS has already been set in the count.
+ */
+ if (adjustment && !(atomic_long_read(&sem->count) &
+ (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_rlock_fast);
+ return sem;
+ }
+ adjustment += RWSEM_FLAG_WAITERS;
+ }
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ if (adjustment)
+ count = atomic_long_add_return(adjustment, &sem->count);
+ else
+ count = atomic_long_read(&sem->count);
+
+ /*
+ * If there are no active locks, wake the front queued process(es).
+ *
+ * If there are no writers and we are first in the queue,
+ * wake our own waiter to join the existing active readers !
+ */
+ if (!(count & RWSEM_LOCK_MASK)) {
+ clear_wr_nonspinnable(sem);
+ wake = true;
+ }
+ if (wake || (!(count & RWSEM_WRITER_MASK) &&
+ (adjustment & RWSEM_FLAG_WAITERS)))
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+
+ /* wait to be given the lock */
+ while (true) {
+ set_current_state(state);
+ if (!waiter.task)
+ break;
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ break;
+ }
+ schedule();
+ lockevent_inc(rwsem_sleep_reader);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock);
+ return sem;
+out_nolock:
+ list_del(&waiter.list);
+ if (list_empty(&sem->wait_list)) {
+ atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
+ &sem->count);
+ }
+ raw_spin_unlock_irq(&sem->wait_lock);
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock_fail);
+ return ERR_PTR(-EINTR);
+}
+
+/*
+ * This function is called by the a write lock owner. So the owner value
+ * won't get changed by others.
+ */
+static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem,
+ bool disable)
+{
+ if (unlikely(disable)) {
+ atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner);
+ lockevent_inc(rwsem_opt_norspin);
+ }
+}
+
+/*
+ * Wait until we successfully acquire the write lock
+ */
+static struct rw_semaphore *
+rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
+{
+ long count;
+ bool disable_rspin;
+ enum writer_wait_state wstate;
+ struct rwsem_waiter waiter;
+ struct rw_semaphore *ret = sem;
+ DEFINE_WAKE_Q(wake_q);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
+ rwsem_optimistic_spin(sem, true))
+ return sem;
+
+ /*
+ * Disable reader optimistic spinning for this rwsem after
+ * acquiring the write lock when the setting of the nonspinnable
+ * bits are observed.
+ */
+ disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE;
+
+ /*
+ * Optimistic spinning failed, proceed to the slowpath
+ * and block until we can acquire the sem.
+ */
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+ waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+
+ /* account for this before adding a new element to the list */
+ wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock */
+ if (wstate == WRITER_NOT_FIRST) {
+ count = atomic_long_read(&sem->count);
+
+ /*
+ * If there were already threads queued before us and:
+ * 1) there are no no active locks, wake the front
+ * queued process(es) as the handoff bit might be set.
+ * 2) there are no active writers and some readers, the lock
+ * must be read owned; so we try to wake any read lock
+ * waiters that were queued ahead of us.
+ */
+ if (count & RWSEM_WRITER_MASK)
+ goto wait;
+
+ rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
+ ? RWSEM_WAKE_READERS
+ : RWSEM_WAKE_ANY, &wake_q);
+
+ if (!wake_q_empty(&wake_q)) {
+ /*
+ * We want to minimize wait_lock hold time especially
+ * when a large number of readers are to be woken up.
+ */
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ wake_q_init(&wake_q); /* Used again, reinit */
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+ } else {
+ atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
+ }
+
+wait:
+ /* wait until we successfully acquire the lock */
+ set_current_state(state);
+ while (true) {
+ if (rwsem_try_write_lock(sem, wstate))
+ break;
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ /* Block until there are no active lockers. */
+ for (;;) {
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
+ schedule();
+ lockevent_inc(rwsem_sleep_writer);
+ set_current_state(state);
+ /*
+ * If HANDOFF bit is set, unconditionally do
+ * a trylock.
+ */
+ if (wstate == WRITER_HANDOFF)
+ break;
+
+ if ((wstate == WRITER_NOT_FIRST) &&
+ (rwsem_first_waiter(sem) == &waiter))
+ wstate = WRITER_FIRST;
+
+ count = atomic_long_read(&sem->count);
+ if (!(count & RWSEM_LOCK_MASK))
+ break;
+
+ /*
+ * The setting of the handoff bit is deferred
+ * until rwsem_try_write_lock() is called.
+ */
+ if ((wstate == WRITER_FIRST) && (rt_task(current) ||
+ time_after(jiffies, waiter.timeout))) {
+ wstate = WRITER_HANDOFF;
+ lockevent_inc(rwsem_wlock_handoff);
+ break;
+ }
+ }
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+ __set_current_state(TASK_RUNNING);
+ list_del(&waiter.list);
+ rwsem_disable_reader_optspin(sem, disable_rspin);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ lockevent_inc(rwsem_wlock);
+
+ return ret;
+
+out_nolock:
+ __set_current_state(TASK_RUNNING);
+ raw_spin_lock_irq(&sem->wait_lock);
+ list_del(&waiter.list);
+
+ if (unlikely(wstate == WRITER_HANDOFF))
+ atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
+
+ if (list_empty(&sem->wait_list))
+ atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
+ else
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ lockevent_inc(rwsem_wlock_fail);
+
+ return ERR_PTR(-EINTR);
+}
+
+/*
+ * handle waking up a waiter on the semaphore
+ * - up_read/up_write has decremented the active part of count if we come here
+ */
+static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - caller incremented waiting part of count and discovered it still negative
+ * - just wake up any readers at the front of the queue
+ */
+static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+
+/*
+ * lock for reading
+ */
+inline void __down_read(struct rw_semaphore *sem)
+{
+ if (!rwsem_read_trylock(sem)) {
+ rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ } else {
+ rwsem_set_reader_owned(sem);
+ }
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ if (!rwsem_read_trylock(sem)) {
+ if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
+ return -EINTR;
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ } else {
+ rwsem_set_reader_owned(sem);
+ }
+ return 0;
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ /*
+ * Optimize for the case when the rwsem is not locked at all.
+ */
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ do {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ tmp + RWSEM_READER_BIAS)) {
+ rwsem_set_reader_owned(sem);
+ return 1;
+ }
+ } while (!(tmp & RWSEM_READ_FAILED_MASK));
+ return 0;
+}
+
+/*
+ * lock for writing
+ */
+static inline void __down_write(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ RWSEM_WRITER_LOCKED)))
+ rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE);
+ else
+ rwsem_set_owner(sem);
+}
+
+static inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ RWSEM_WRITER_LOCKED))) {
+ if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE)))
+ return -EINTR;
+ } else {
+ rwsem_set_owner(sem);
+ }
+ return 0;
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * unlock after reading
+ */
+inline void __up_read(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ rwsem_clear_reader_owned(sem);
+ tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
+ DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
+ if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
+ RWSEM_FLAG_WAITERS)) {
+ clear_wr_nonspinnable(sem);
+ rwsem_wake(sem, tmp);
+ }
+}
+
+/*
+ * unlock after writing
+ */
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ /*
+ * sem->owner may differ from current if the ownership is transferred
+ * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
+ */
+ DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
+ !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+ rwsem_clear_owner(sem);
+ tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
+ if (unlikely(tmp & RWSEM_FLAG_WAITERS))
+ rwsem_wake(sem, tmp);
+}
+
+/*
+ * downgrade write lock to read lock
+ */
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ /*
+ * When downgrading from exclusive to shared ownership,
+ * anything inside the write-locked region cannot leak
+ * into the read side. In contrast, anything in the
+ * read-locked region is ok to be re-ordered into the
+ * write side. As such, rely on RELEASE semantics.
+ */
+ DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ tmp = atomic_long_fetch_add_release(
+ -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
+ rwsem_set_reader_owned(sem);
+ if (tmp & RWSEM_FLAG_WAITERS)
+ rwsem_downgrade_wake(sem);
+}
/*
* lock for reading
@@ -24,9 +1445,7 @@ void __sched down_read(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
- rwsem_set_reader_owned(sem);
}
-
EXPORT_SYMBOL(down_read);
int __sched down_read_killable(struct rw_semaphore *sem)
@@ -39,10 +1458,8 @@ int __sched down_read_killable(struct rw_semaphore *sem)
return -EINTR;
}
- rwsem_set_reader_owned(sem);
return 0;
}
-
EXPORT_SYMBOL(down_read_killable);
/*
@@ -52,13 +1469,10 @@ int down_read_trylock(struct rw_semaphore *sem)
{
int ret = __down_read_trylock(sem);
- if (ret == 1) {
+ if (ret == 1)
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
- rwsem_set_reader_owned(sem);
- }
return ret;
}
-
EXPORT_SYMBOL(down_read_trylock);
/*
@@ -68,11 +1482,8 @@ void __sched down_write(struct rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
-
EXPORT_SYMBOL(down_write);
/*
@@ -83,15 +1494,14 @@ int __sched down_write_killable(struct rw_semaphore *sem)
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
- if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
+ __down_write_killable)) {
rwsem_release(&sem->dep_map, 1, _RET_IP_);
return -EINTR;
}
- rwsem_set_owner(sem);
return 0;
}
-
EXPORT_SYMBOL(down_write_killable);
/*
@@ -101,14 +1511,11 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);
- if (ret == 1) {
+ if (ret == 1)
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
- rwsem_set_owner(sem);
- }
return ret;
}
-
EXPORT_SYMBOL(down_write_trylock);
/*
@@ -117,12 +1524,8 @@ EXPORT_SYMBOL(down_write_trylock);
void up_read(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
- DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));
-
- rwsem_clear_reader_owned(sem);
__up_read(sem);
}
-
EXPORT_SYMBOL(up_read);
/*
@@ -131,12 +1534,8 @@ EXPORT_SYMBOL(up_read);
void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
- DEBUG_RWSEMS_WARN_ON(sem->owner != current);
-
- rwsem_clear_owner(sem);
__up_write(sem);
}
-
EXPORT_SYMBOL(up_write);
/*
@@ -145,12 +1544,8 @@ EXPORT_SYMBOL(up_write);
void downgrade_write(struct rw_semaphore *sem)
{
lock_downgrade(&sem->dep_map, _RET_IP_);
- DEBUG_RWSEMS_WARN_ON(sem->owner != current);
-
- rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
-
EXPORT_SYMBOL(downgrade_write);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -159,43 +1554,32 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
- rwsem_set_reader_owned(sem);
}
-
EXPORT_SYMBOL(down_read_nested);
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
{
might_sleep();
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
-
EXPORT_SYMBOL(_down_write_nest_lock);
void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
-
__down_read(sem);
__rwsem_set_reader_owned(sem, NULL);
}
-
EXPORT_SYMBOL(down_read_non_owner);
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
- rwsem_set_owner(sem);
}
-
EXPORT_SYMBOL(down_write_nested);
int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
@@ -203,23 +1587,21 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
- if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
+ __down_write_killable)) {
rwsem_release(&sem->dep_map, 1, _RET_IP_);
return -EINTR;
}
- rwsem_set_owner(sem);
return 0;
}
-
EXPORT_SYMBOL(down_write_killable_nested);
void up_read_non_owner(struct rw_semaphore *sem)
{
- DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
__up_read(sem);
}
-
EXPORT_SYMBOL(up_read_non_owner);
#endif
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index bad2bca0268b..2534ce49f648 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,134 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * The least significant 2 bits of the owner value has the following
- * meanings when set.
- * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers
- * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned,
- * i.e. the owner(s) cannot be readily determined. It can be reader
- * owned or the owning writer is indeterminate.
- *
- * When a writer acquires a rwsem, it puts its task_struct pointer
- * into the owner field. It is cleared after an unlock.
- *
- * When a reader acquires a rwsem, it will also puts its task_struct
- * pointer into the owner field with both the RWSEM_READER_OWNED and
- * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will
- * largely be left untouched. So for a free or reader-owned rwsem,
- * the owner value may contain information about the last reader that
- * acquires the rwsem. The anonymous bit is set because that particular
- * reader may or may not still own the lock.
- *
- * That information may be helpful in debugging cases where the system
- * seems to hang on a reader owned rwsem especially if only one reader
- * is involved. Ideally we would like to track all the readers that own
- * a rwsem, but the overhead is simply too big.
- */
-#define RWSEM_READER_OWNED (1UL << 0)
-#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
-#ifdef CONFIG_DEBUG_RWSEMS
-# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c)
-#else
-# define DEBUG_RWSEMS_WARN_ON(c)
-#endif
+#ifndef __INTERNAL_RWSEM_H
+#define __INTERNAL_RWSEM_H
+#include <linux/rwsem.h>
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-/*
- * All writes to owner are protected by WRITE_ONCE() to make sure that
- * store tearing can't happen as optimistic spinners may read and use
- * the owner value concurrently without lock. Read from owner, however,
- * may not need READ_ONCE() as long as the pointer value is only used
- * for comparison and isn't being dereferenced.
- */
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
- WRITE_ONCE(sem->owner, current);
-}
+extern void __down_read(struct rw_semaphore *sem);
+extern void __up_read(struct rw_semaphore *sem);
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
- WRITE_ONCE(sem->owner, NULL);
-}
-
-/*
- * The task_struct pointer of the last owning reader will be left in
- * the owner field.
- *
- * Note that the owner value just indicates the task has owned the rwsem
- * previously, it may not be the real owner or one of the real owners
- * anymore when that field is examined, so take it with a grain of salt.
- */
-static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
- struct task_struct *owner)
-{
- unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
- | RWSEM_ANONYMOUSLY_OWNED;
-
- WRITE_ONCE(sem->owner, (struct task_struct *)val);
-}
-
-static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
-{
- __rwsem_set_reader_owned(sem, current);
-}
-
-/*
- * Return true if the a rwsem waiter can spin on the rwsem's owner
- * and steal the lock, i.e. the lock is not anonymously owned.
- * N.B. !owner is considered spinnable.
- */
-static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
-{
- return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
-}
-
-/*
- * Return true if rwsem is owned by an anonymous writer or readers.
- */
-static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
-{
- return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
-}
-
-#ifdef CONFIG_DEBUG_RWSEMS
-/*
- * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
- * is a task pointer in owner of a reader-owned rwsem, it will be the
- * real owner or one of the real owners. The only exception is when the
- * unlock is done by up_read_non_owner().
- */
-#define rwsem_clear_reader_owned rwsem_clear_reader_owned
-static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
-{
- unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
- | RWSEM_ANONYMOUSLY_OWNED;
- if (READ_ONCE(sem->owner) == (struct task_struct *)val)
- cmpxchg_relaxed((unsigned long *)&sem->owner, val,
- RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
-}
-#endif
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
- struct task_struct *owner)
-{
-}
-
-static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
-{
-}
-#endif
-
-#ifndef rwsem_clear_reader_owned
-static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
-{
-}
-#endif
+#endif /* __INTERNAL_RWSEM_H */
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 561acdd39960..d9dd94defc0a 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -1,9 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008 Intel Corporation
* Author: Matthew Wilcox <willy@linux.intel.com>
*
- * Distributed under the terms of the GNU GPL, version 2
- *
* This file implements counting semaphores.
* A counting semaphore may be acquired 'n' times before sleeping.
* See mutex.c for single-acquisition sleeping locks which enforce
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 936f3d14dd6b..0ff08380f531 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -22,6 +22,13 @@
#include <linux/debug_locks.h>
#include <linux/export.h>
+#ifdef CONFIG_MMIOWB
+#ifndef arch_mmiowb_state
+DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state);
+EXPORT_PER_CPU_SYMBOL(__mmiowb_state);
+#endif
+#endif
+
/*
* If lockdep is enabled then we use the non-preemption spin-ops
* even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 9aa0fccd5d43..399669f7eba8 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock)
{
debug_spin_lock_before(lock);
arch_spin_lock(&lock->raw_lock);
+ mmiowb_spin_lock();
debug_spin_lock_after(lock);
}
@@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
{
int ret = arch_spin_trylock(&lock->raw_lock);
- if (ret)
+ if (ret) {
+ mmiowb_spin_lock();
debug_spin_lock_after(lock);
+ }
#ifndef CONFIG_SMP
/*
* Must not happen on UP:
@@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)
void do_raw_spin_unlock(raw_spinlock_t *lock)
{
+ mmiowb_spin_unlock();
debug_spin_unlock(lock);
arch_spin_unlock(&lock->raw_lock);
}
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 65a3b7e55b9f..3e82f449b4ff 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Module-based API test facility for ww_mutexes
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
*/
#include <linux/kernel.h>
diff --git a/kernel/memremap.c b/kernel/memremap.c
index a856cb5ff192..6e1970719dc2 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
*/
return devmem->page_fault(vma, addr, page, flags, pmdp);
}
-EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */
static void pgmap_array_delete(struct resource *res)
@@ -96,6 +95,7 @@ static void devm_memremap_pages_release(void *data)
pgmap->kill(pgmap->ref);
for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
+ pgmap->cleanup(pgmap->ref);
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
@@ -134,8 +134,8 @@ static void devm_memremap_pages_release(void *data)
* 2/ The altmap field may optionally be initialized, in which case altmap_valid
* must be set to true
*
- * 3/ pgmap->ref must be 'live' on entry and will be killed at
- * devm_memremap_pages_release() time, or if this routine fails.
+ * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
+ * at devm_memremap_pages_release() time, or if this routine fails.
*
* 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -148,11 +148,19 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
&pgmap->altmap : NULL;
struct resource *res = &pgmap->res;
struct dev_pagemap *conflict_pgmap;
+ struct mhp_restrictions restrictions = {
+ /*
+ * We do not want any optional features only our own memmap
+ */
+ .altmap = altmap,
+ };
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
- if (!pgmap->ref || !pgmap->kill)
+ if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
+ WARN(1, "Missing reference count teardown definition\n");
return ERR_PTR(-EINVAL);
+ }
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -163,14 +171,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
if (conflict_pgmap) {
dev_WARN(dev, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
- return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ goto err_array;
}
conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
if (conflict_pgmap) {
dev_WARN(dev, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
- return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ goto err_array;
}
is_ram = region_intersects(align_start, align_size,
@@ -214,7 +224,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
*/
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
error = add_pages(nid, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, NULL, false);
+ align_size >> PAGE_SHIFT, &restrictions);
} else {
error = kasan_add_zero_shadow(__va(align_start), align_size);
if (error) {
@@ -222,8 +232,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_kasan;
}
- error = arch_add_memory(nid, align_start, align_size, altmap,
- false);
+ error = arch_add_memory(nid, align_start, align_size,
+ &restrictions);
}
if (!error) {
@@ -262,10 +272,18 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
pgmap_array_delete(res);
err_array:
pgmap->kill(pgmap->ref);
+ pgmap->cleanup(pgmap->ref);
+
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(devm_memremap_pages);
+void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+ devm_release_action(dev, devm_memremap_pages_release, pgmap);
+}
+EXPORT_SYMBOL_GPL(devm_memunmap_pages);
+
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
/* number of pfns from base where pfn_to_page() is valid */
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
index 79c9be2dbbe9..33783abc377b 100644
--- a/kernel/module-internal.h
+++ b/kernel/module-internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Module internals
*
* Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
*/
#include <linux/elf.h>
@@ -20,7 +16,7 @@ struct load_info {
unsigned long len;
Elf_Shdr *sechdrs;
char *secstrings, *strtab;
- unsigned long symoffs, stroffs;
+ unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
struct _ddebug *debug;
unsigned int num_debug;
bool sig_ok;
diff --git a/kernel/module.c b/kernel/module.c
index 2ad1b5239910..a2cee14a83f3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
Copyright (C) 2002 Richard Henderson
Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/export.h>
#include <linux/extable.h>
@@ -98,6 +86,10 @@ DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
+/* Work queue for freeing init sections in success case */
+static struct work_struct init_free_wq;
+static struct llist_head init_free_list;
+
#ifdef CONFIG_MODULES_TREE_LOOKUP
/*
@@ -286,6 +278,11 @@ bool is_module_sig_enforced(void)
}
EXPORT_SYMBOL(is_module_sig_enforced);
+void set_module_sig_enforced(void)
+{
+ sig_enforce = true;
+}
+
/* Block module loading/unloading? */
int modules_disabled = 0;
core_param(nomodule, modules_disabled, bint, 0);
@@ -1949,9 +1946,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
if (!rodata_enabled)
return;
+ set_vm_flush_reset_perms(mod->core_layout.base);
+ set_vm_flush_reset_perms(mod->init_layout.base);
frob_text(&mod->core_layout, set_memory_ro);
+ frob_text(&mod->core_layout, set_memory_x);
+
frob_rodata(&mod->core_layout, set_memory_ro);
+
frob_text(&mod->init_layout, set_memory_ro);
+ frob_text(&mod->init_layout, set_memory_x);
+
frob_rodata(&mod->init_layout, set_memory_ro);
if (after_init)
@@ -1967,15 +1971,6 @@ static void module_enable_nx(const struct module *mod)
frob_writable_data(&mod->init_layout, set_memory_nx);
}
-static void module_disable_nx(const struct module *mod)
-{
- frob_rodata(&mod->core_layout, set_memory_x);
- frob_ro_after_init(&mod->core_layout, set_memory_x);
- frob_writable_data(&mod->core_layout, set_memory_x);
- frob_rodata(&mod->init_layout, set_memory_x);
- frob_writable_data(&mod->init_layout, set_memory_x);
-}
-
/* Iterate through all modules and set each module's text as RW */
void set_all_modules_text_rw(void)
{
@@ -2019,23 +2014,8 @@ void set_all_modules_text_ro(void)
}
mutex_unlock(&module_mutex);
}
-
-static void disable_ro_nx(const struct module_layout *layout)
-{
- if (rodata_enabled) {
- frob_text(layout, set_memory_rw);
- frob_rodata(layout, set_memory_rw);
- frob_ro_after_init(layout, set_memory_rw);
- }
- frob_rodata(layout, set_memory_x);
- frob_ro_after_init(layout, set_memory_x);
- frob_writable_data(layout, set_memory_x);
-}
-
#else
-static void disable_ro_nx(const struct module_layout *layout) { }
static void module_enable_nx(const struct module *mod) { }
-static void module_disable_nx(const struct module *mod) { }
#endif
#ifdef CONFIG_LIVEPATCH
@@ -2115,6 +2095,11 @@ static void free_module_elf(struct module *mod)
void __weak module_memfree(void *module_region)
{
+ /*
+ * This memory may be RO, and freeing RO memory in an interrupt is not
+ * supported by vmalloc.
+ */
+ WARN_ON(in_interrupt());
vfree(module_region);
}
@@ -2166,7 +2151,6 @@ static void free_module(struct module *mod)
mutex_unlock(&module_mutex);
/* This may be empty, but that's OK */
- disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
module_memfree(mod->init_layout.base);
kfree(mod->args);
@@ -2176,7 +2160,6 @@ static void free_module(struct module *mod)
lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
/* Finally, free the core (containing the module structure) */
- disable_ro_nx(&mod->core_layout);
module_memfree(mod->core_layout.base);
}
@@ -2647,6 +2630,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)
info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1);
info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
mod->core_layout.size += strtab_size;
+ info->core_typeoffs = mod->core_layout.size;
+ mod->core_layout.size += ndst * sizeof(char);
mod->core_layout.size = debug_align(mod->core_layout.size);
/* Put string table section at end of init part of module. */
@@ -2660,6 +2645,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)
__alignof__(struct mod_kallsyms));
info->mod_kallsyms_init_off = mod->init_layout.size;
mod->init_layout.size += sizeof(struct mod_kallsyms);
+ info->init_typeoffs = mod->init_layout.size;
+ mod->init_layout.size += nsrc * sizeof(char);
mod->init_layout.size = debug_align(mod->init_layout.size);
}
@@ -2683,20 +2670,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
/* Make sure we get permanent strtab: don't use info->strtab. */
mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+ mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs;
- /* Set types up while we still have access to sections. */
- for (i = 0; i < mod->kallsyms->num_symtab; i++)
- mod->kallsyms->symtab[i].st_size
- = elf_type(&mod->kallsyms->symtab[i], info);
-
- /* Now populate the cut down core kallsyms for after init. */
+ /*
+ * Now populate the cut down core kallsyms for after init
+ * and set types up while we still have access to sections.
+ */
mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
+ mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs;
src = mod->kallsyms->symtab;
for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
+ mod->kallsyms->typetab[i] = elf_type(src + i, info);
if (i == 0 || is_livepatch_module(mod) ||
is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
info->index.pcpu)) {
+ mod->core_kallsyms.typetab[ndst] =
+ mod->kallsyms->typetab[i];
dst[ndst] = src[i];
dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
@@ -2719,11 +2709,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig
{
if (!debug)
return;
-#ifdef CONFIG_DYNAMIC_DEBUG
- if (ddebug_add_module(debug, num, mod->name))
- pr_err("dynamic debug error adding module: %s\n",
- debug->modname);
-#endif
+ ddebug_add_module(debug, num, mod->name);
}
static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
@@ -3097,6 +3083,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->tracepoints_ptrs),
&mod->num_tracepoints);
#endif
+#ifdef CONFIG_TREE_SRCU
+ mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
+ sizeof(*mod->srcu_struct_ptrs),
+ &mod->num_srcu_structs);
+#endif
#ifdef CONFIG_BPF_EVENTS
mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
sizeof(*mod->bpf_raw_events),
@@ -3419,16 +3410,33 @@ static void do_mod_ctors(struct module *mod)
/* For freeing module_init on success, in case kallsyms traversing */
struct mod_initfree {
- struct rcu_head rcu;
+ struct llist_node node;
void *module_init;
};
-static void do_free_init(struct rcu_head *head)
+static void do_free_init(struct work_struct *w)
+{
+ struct llist_node *pos, *n, *list;
+ struct mod_initfree *initfree;
+
+ list = llist_del_all(&init_free_list);
+
+ synchronize_rcu();
+
+ llist_for_each_safe(pos, n, list) {
+ initfree = container_of(pos, struct mod_initfree, node);
+ module_memfree(initfree->module_init);
+ kfree(initfree);
+ }
+}
+
+static int __init modules_wq_init(void)
{
- struct mod_initfree *m = container_of(head, struct mod_initfree, rcu);
- module_memfree(m->module_init);
- kfree(m);
+ INIT_WORK(&init_free_wq, do_free_init);
+ init_llist_head(&init_free_list);
+ return 0;
}
+module_init(modules_wq_init);
/*
* This is where the real work happens.
@@ -3506,7 +3514,6 @@ static noinline int do_init_module(struct module *mod)
#endif
module_enable_ro(mod, true);
mod_tree_remove_init(mod);
- disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);
mod->init_layout.base = NULL;
mod->init_layout.size = 0;
@@ -3517,14 +3524,18 @@ static noinline int do_init_module(struct module *mod)
* We want to free module_init, but be aware that kallsyms may be
* walking this with preempt disabled. In all the failure paths, we
* call synchronize_rcu(), but we don't want to slow down the success
- * path, so use actual RCU here.
+ * path. module_memfree() cannot be called in an interrupt, so do the
+ * work and call synchronize_rcu() in a work queue.
+ *
* Note that module_alloc() on most architectures creates W+X page
* mappings which won't be cleaned up until do_free_init() runs. Any
* code such as mark_rodata_ro() which depends on those mappings to
* be cleaned up needs to sync with the queued work - ie
* rcu_barrier()
*/
- call_rcu(&freeinit->rcu, do_free_init);
+ if (llist_add(&freeinit->node, &init_free_list))
+ schedule_work(&init_free_wq);
+
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
@@ -3821,10 +3832,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_bug_cleanup(mod);
mutex_unlock(&module_mutex);
- /* we can't deallocate the module until we clear memory protection */
- module_disable_ro(mod);
- module_disable_nx(mod);
-
ddebug_cleanup:
ftrace_release_mod(mod);
dynamic_debug_remove(mod, info->debug);
@@ -4084,7 +4091,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
const Elf_Sym *sym = &kallsyms->symtab[symnum];
*value = kallsyms_symbol_value(sym);
- *type = sym->st_size;
+ *type = kallsyms->typetab[symnum];
strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
strlcpy(module_name, mod->name, MODULE_NAME_LEN);
*exported = is_exported(name, *value, mod);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 6b9a926fd86b..b10fb1986ca9 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Module signature checker
*
* Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
*/
#include <linux/kernel.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 6196af8a8223..d9f5081d578d 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/export.h>
@@ -22,6 +23,7 @@ static int notifier_chain_register(struct notifier_block **nl,
struct notifier_block *n)
{
while ((*nl) != NULL) {
+ WARN_ONCE(((*nl) == n), "double register detected");
if (n->priority > (*nl)->priority)
break;
nl = &((*nl)->next);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f6c5d330059a..c815f58e6bc0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2006 IBM Corporation
*
* Author: Serge Hallyn <serue@us.ibm.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
- *
* Jun 2006 - namespaces support
* OpenVZ, SWsoft Inc.
* Pavel Emelianov <xemul@openvz.org>
diff --git a/kernel/padata.c b/kernel/padata.c
index 3e2633ae3bca..2d2fddbb7a4c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = {
&parallel_cpumask_attr.attr,
NULL,
};
+ATTRIBUTE_GROUPS(padata_default);
static ssize_t padata_sysfs_show(struct kobject *kobj,
struct attribute *attr, char *buf)
@@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = {
static struct kobj_type padata_attr_type = {
.sysfs_ops = &padata_sysfs_ops,
- .default_attrs = padata_default_attrs,
+ .default_groups = padata_default_groups,
.release = padata_sysfs_release,
};
diff --git a/kernel/panic.c b/kernel/panic.c
index f121e6ba7e11..4d9f55bf7d38 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/panic.c
*
@@ -51,6 +52,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_TIMER_INFO 0x00000004
#define PANIC_PRINT_LOCK_INFO 0x00000008
#define PANIC_PRINT_FTRACE_INFO 0x00000010
+#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -134,6 +136,9 @@ EXPORT_SYMBOL(nmi_panic);
static void panic_print_sys_info(void)
{
+ if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
+
if (panic_print & PANIC_PRINT_TASK_INFO)
show_state();
@@ -277,7 +282,7 @@ void panic(const char *fmt, ...)
* panic() is not being callled from OOPS.
*/
debug_locks_off();
- console_flush_on_panic();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
panic_print_sys_info();
@@ -306,6 +311,8 @@ void panic(const char *fmt, ...)
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
+ if (panic_reboot_mode != REBOOT_UNDEFINED)
+ reboot_mode = panic_reboot_mode;
emergency_restart();
}
#ifdef __sparc__
@@ -318,14 +325,12 @@ void panic(const char *fmt, ...)
}
#endif
#if defined(CONFIG_S390)
- {
- unsigned long caller;
-
- caller = (unsigned long)__builtin_return_address(0);
- disabled_wait(caller);
- }
+ disabled_wait();
#endif
pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
+
+ /* Do not scroll important messages printed above */
+ suppress_printk = 1;
local_irq_enable();
for (i = 0; ; i += PANIC_TIMER_STEP) {
touch_softlockup_watchdog();
@@ -642,16 +647,14 @@ static int clear_warn_once_set(void *data, u64 val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
- NULL,
- clear_warn_once_set,
- "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set,
+ "%lld\n");
static __init int register_warn_debugfs(void)
{
/* Don't care about failure */
- debugfs_create_file("clear_warn_once", 0200, NULL,
- NULL, &clear_warn_once_fops);
+ debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL,
+ &clear_warn_once_fops);
return 0;
}
diff --git a/kernel/params.c b/kernel/params.c
index ce89f757e6da..cf448785d058 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Helpers for initial module or kernel cmdline parsing
Copyright (C) 2001 Rusty Russell.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/string.h>
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..16263b526560 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
@@ -32,12 +33,13 @@
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
-#include <linux/hash.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
@@ -214,6 +216,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
+ init_waitqueue_head(&pid->wait_pidfd);
+
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
@@ -451,6 +455,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return idr_get_next(&ns->idr, &nr);
}
+/**
+ * pidfd_create() - Create a new pid file descriptor.
+ *
+ * @pid: struct pid that the pidfd will reference
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set.
+ *
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ * On error, a negative errno number will be returned.
+ */
+static int pidfd_create(struct pid *pid)
+{
+ int fd;
+
+ fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
+ O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+ put_pid(pid);
+
+ return fd;
+}
+
+/**
+ * pidfd_open() - Open new pid file descriptor.
+ *
+ * @pid: pid for which to retrieve a pidfd
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set for
+ * the process identified by @pid. Currently, the process identified by
+ * @pid must be a thread-group leader. This restriction currently exists
+ * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
+ * be used with CLONE_THREAD) and pidfd polling (only supports thread group
+ * leaders).
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ * On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
+{
+ int fd, ret;
+ struct pid *p;
+
+ if (flags)
+ return -EINVAL;
+
+ if (pid <= 0)
+ return -EINVAL;
+
+ p = find_get_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ ret = 0;
+ rcu_read_lock();
+ if (!pid_task(p, PIDTYPE_TGID))
+ ret = -EINVAL;
+ rcu_read_unlock();
+
+ fd = ret ?: pidfd_create(p);
+ put_pid(p);
+ return fd;
+}
+
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index aa6e72fb7c08..6d726cef241c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Pid namespaces
*
@@ -325,7 +326,7 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
}
read_lock(&tasklist_lock);
- force_sig(SIGKILL, pid_ns->child_reaper);
+ send_sig(SIGKILL, pid_ns->child_reaper, 1);
read_unlock(&tasklist_lock);
do_exit(0);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index f8fe57d1022e..ff8592ddedee 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config SUSPEND
bool "Suspend to RAM and standby"
depends on ARCH_SUSPEND_POSSIBLE
@@ -114,6 +115,15 @@ config PM_SLEEP_SMP
depends on PM_SLEEP
select HOTPLUG_CPU
+config PM_SLEEP_SMP_NONZERO_CPU
+ def_bool y
+ depends on PM_SLEEP_SMP
+ depends on ARCH_SUSPEND_NONZERO_CPU
+ ---help---
+ If an arch can suspend (for suspend, hibernate, kexec, etc) on a
+ non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This
+ will allow nohz_full mask to include CPU0.
+
config PM_AUTOSLEEP
bool "Opportunistic sleep"
depends on PM_SLEEP
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index d9dc2c38764a..0a9326f5f421 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -10,6 +10,7 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
+#include <linux/debugfs.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>
@@ -23,6 +24,60 @@ static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
*/
static DEFINE_MUTEX(em_pd_mutex);
+#ifdef CONFIG_DEBUG_FS
+static struct dentry *rootdir;
+
+static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
+{
+ struct dentry *d;
+ char name[24];
+
+ snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
+
+ /* Create per-cs directory */
+ d = debugfs_create_dir(name, pd);
+ debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
+ debugfs_create_ulong("power", 0444, d, &cs->power);
+ debugfs_create_ulong("cost", 0444, d, &cs->cost);
+}
+
+static int em_debug_cpus_show(struct seq_file *s, void *unused)
+{
+ seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
+
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
+{
+ struct dentry *d;
+ char name[8];
+ int i;
+
+ snprintf(name, sizeof(name), "pd%d", cpu);
+
+ /* Create the directory of the performance domain */
+ d = debugfs_create_dir(name, rootdir);
+
+ debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
+
+ /* Create a sub-directory for each capacity state */
+ for (i = 0; i < pd->nr_cap_states; i++)
+ em_debug_create_cs(&pd->table[i], d);
+}
+
+static int __init em_debug_init(void)
+{
+ /* Create /sys/kernel/debug/energy_model directory */
+ rootdir = debugfs_create_dir("energy_model", NULL);
+
+ return 0;
+}
+core_initcall(em_debug_init);
+#else /* CONFIG_DEBUG_FS */
+static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
+#endif
static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
struct em_data_callback *cb)
{
@@ -102,6 +157,8 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
pd->nr_cap_states = nr_states;
cpumask_copy(to_cpumask(pd->cpus), span);
+ em_debug_create_pd(pd, cpu);
+
return pd;
free_cs_table:
@@ -166,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
* All CPUs of a domain must have the same micro-architecture
* since they all share the same table.
*/
- cap = arch_scale_cpu_capacity(NULL, cpu);
+ cap = arch_scale_cpu_capacity(cpu);
if (prev_cap && prev_cap != cap) {
pr_err("CPUs of %*pbl must have the same capacity\n",
cpumask_pr_args(span));
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index abef759de7c8..cd7434e6000d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
*
@@ -6,15 +7,12 @@
* Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
* Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
- *
- * This file is released under the GPLv2.
*/
#define pr_fmt(fmt) "PM: " fmt
#include <linux/export.h>
#include <linux/suspend.h>
-#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
@@ -130,7 +128,7 @@ static int hibernation_test(int level) { return 0; }
static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
- hibernation_ops->begin() : 0;
+ hibernation_ops->begin(PMSG_FREEZE) : 0;
}
/**
@@ -258,6 +256,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop,
(kps % 1000) / 10);
}
+__weak int arch_resume_nosmt(void)
+{
+ return 0;
+}
+
/**
* create_image - Create a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
@@ -281,7 +284,7 @@ static int create_image(int platform_mode)
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;
- error = disable_nonboot_cpus();
+ error = suspend_disable_secondary_cpus();
if (error || hibernation_test(TEST_CPUS))
goto Enable_cpus;
@@ -323,7 +326,11 @@ static int create_image(int platform_mode)
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
+
+ /* Allow architectures to do nosmt-specific post-resume dances */
+ if (!in_suspend)
+ error = arch_resume_nosmt();
Platform_finish:
platform_finish(platform_mode);
@@ -417,7 +424,7 @@ int hibernation_snapshot(int platform_mode)
int __weak hibernate_resume_nonboot_cpu_disable(void)
{
- return disable_nonboot_cpus();
+ return suspend_disable_secondary_cpus();
}
/**
@@ -486,7 +493,7 @@ static int resume_target_kernel(bool platform_mode)
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
Cleanup:
platform_restore_cleanup(platform_mode);
@@ -543,7 +550,7 @@ int hibernation_platform_enter(void)
* hibernation_ops->finish() before saving the image, so we should let
* the firmware know that we're going to enter the sleep state after all
*/
- error = hibernation_ops->begin();
+ error = hibernation_ops->begin(PMSG_HIBERNATE);
if (error)
goto Close;
@@ -564,7 +571,7 @@ int hibernation_platform_enter(void)
if (error)
goto Platform_finish;
- error = disable_nonboot_cpus();
+ error = suspend_disable_secondary_cpus();
if (error)
goto Enable_cpus;
@@ -586,7 +593,7 @@ int hibernation_platform_enter(void)
local_irq_enable();
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
Platform_finish:
hibernation_ops->finish();
@@ -709,9 +716,7 @@ int hibernate(void)
goto Exit;
}
- pr_info("Syncing filesystems ... \n");
- ksys_sync();
- pr_info("done.\n");
+ ksys_sync_helper();
error = freeze_processes();
if (error)
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 98e76cad128b..bdbd605c4215 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/main.c - PM subsystem core functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
- *
- * This file is released under the GPLv2
- *
*/
#include <linux/export.h>
@@ -16,6 +14,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/suspend.h>
+#include <linux/syscalls.h>
#include "power.h"
@@ -51,6 +50,19 @@ void unlock_system_sleep(void)
}
EXPORT_SYMBOL_GPL(unlock_system_sleep);
+void ksys_sync_helper(void)
+{
+ ktime_t start;
+ long elapsed_msecs;
+
+ start = ktime_get();
+ ksys_sync();
+ elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start));
+ pr_info("Filesystems sync: %ld.%03ld seconds\n",
+ elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC);
+}
+EXPORT_SYMBOL_GPL(ksys_sync_helper);
+
/* Routines for PM-transition notifications */
static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9e58bdc8a562..44bee462ff57 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -75,8 +75,6 @@ static inline void hibernate_reserved_size_init(void) {}
static inline void hibernate_image_size_init(void) {}
#endif /* !CONFIG_HIBERNATION */
-extern int pfn_is_nosave(unsigned long);
-
#define power_attr(_name) \
static struct kobj_attribute _name##_attr = { \
.attr = { \
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 7ef6866b521d..6d475281c730 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -1,7 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* poweroff.c - sysrq handler to gracefully power down machine.
- *
- * This file is released under the GPL v2
*/
#include <linux/kernel.h>
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index b7a82502857a..33e3febaba53 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This module exposes the interface to kernel space for specifying
* QoS dependencies. It provides infrastructure for registration of:
@@ -582,10 +583,8 @@ static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d)
qos->pm_qos_power_miscdev.name = qos->name;
qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
- if (d) {
- (void)debugfs_create_file(qos->name, S_IRUGO, d,
- (void *)qos, &pm_qos_debug_fops);
- }
+ debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos,
+ &pm_qos_debug_fops);
return misc_register(&qos->pm_qos_power_miscdev);
}
@@ -685,8 +684,6 @@ static int __init pm_qos_power_init(void)
BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
d = debugfs_create_dir("pm_qos", NULL);
- if (IS_ERR_OR_NULL(d))
- d = NULL;
for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
ret = register_pm_qos_misc(pm_qos_array[i], d);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 640b2034edd6..83105874f255 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/power/snapshot.c
*
@@ -5,9 +6,6 @@
*
* Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- *
*/
#define pr_fmt(fmt) "PM: " fmt
@@ -965,6 +963,9 @@ void __init __register_nosave_region(unsigned long start_pfn,
/* This allocation cannot fail */
region = memblock_alloc(sizeof(struct nosave_region),
SMP_CACHE_BYTES);
+ if (!region)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(struct nosave_region));
}
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
@@ -1215,14 +1216,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(!PageHighMem(page));
- if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
- PageReserved(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
+ return NULL;
+
+ if (PageReserved(page) || PageOffline(page))
return NULL;
if (page_is_guard(page))
@@ -1277,8 +1280,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (!pfn_valid(pfn))
return NULL;
- page = pfn_to_page(pfn);
- if (page_zone(page) != zone)
+ page = pfn_to_online_page(pfn);
+ if (!page || page_zone(page) != zone)
return NULL;
BUG_ON(PageHighMem(page));
@@ -1286,6 +1289,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
+ if (PageOffline(page))
+ return NULL;
+
if (PageReserved(page)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
@@ -1334,8 +1340,9 @@ static inline void do_copy_page(long *dst, long *src)
* safe_copy_page - Copy a page in a safe way.
*
* Check if the page we are going to copy is marked as present in the kernel
- * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set
- * and in that case kernel_page_present() always returns 'true').
+ * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
+ * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
+ * always returns 'true'.
*/
static void safe_copy_page(void *dst, struct page *s_page)
{
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 0bd595a0b610..c874a7026e24 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/suspend.c - Suspend to RAM and standby functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
- *
- * This file is released under the GPLv2.
*/
#define pr_fmt(fmt) "PM: " fmt
@@ -17,7 +16,6 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
-#include <linux/syscalls.h>
#include <linux/gfp.h>
#include <linux/io.h>
#include <linux/kernel.h>
@@ -63,11 +61,17 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
enum s2idle_states __read_mostly s2idle_state;
static DEFINE_RAW_SPINLOCK(s2idle_lock);
-bool pm_suspend_via_s2idle(void)
+/**
+ * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend.
+ *
+ * Return 'true' if suspend-to-idle has been selected as the default system
+ * suspend method.
+ */
+bool pm_suspend_default_s2idle(void)
{
return mem_sleep_current == PM_SUSPEND_TO_IDLE;
}
-EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle);
+EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle);
void s2idle_set_ops(const struct platform_s2idle_ops *ops)
{
@@ -428,7 +432,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
- error = disable_nonboot_cpus();
+ error = suspend_disable_secondary_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -458,7 +462,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
BUG_ON(irqs_disabled());
Enable_cpus:
- enable_nonboot_cpus();
+ suspend_enable_secondary_cpus();
Platform_wake:
platform_resume_noirq(state);
@@ -489,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state)
pm_suspend_target_state = state;
+ if (state == PM_SUSPEND_TO_IDLE)
+ pm_set_suspend_no_platform();
+
error = platform_suspend_begin(state);
if (error)
goto Close;
@@ -568,13 +575,11 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_TO_IDLE)
s2idle_begin();
-#ifndef CONFIG_SUSPEND_SKIP_SYNC
- trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- pr_info("Syncing filesystems ... ");
- ksys_sync();
- pr_cont("done.\n");
- trace_suspend_resume(TPS("sync_filesystems"), 0, false);
-#endif
+ if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) {
+ trace_suspend_resume(TPS("sync_filesystems"), 0, true);
+ ksys_sync_helper();
+ trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+ }
pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
pm_suspend_clear_flags();
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 6a897e8b2a88..60564b58de07 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -1,9 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
*
* Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
- *
- * This file is released under the GPLv2.
*/
#include <linux/init.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7f6c1a288d3..ca0fcb5ced71 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/power/swap.c
*
@@ -7,9 +8,6 @@
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
* Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
- *
- * This file is released under the GPLv2.
- *
*/
#define pr_fmt(fmt) "PM: " fmt
@@ -976,12 +974,11 @@ static int get_swap_reader(struct swap_map_handle *handle,
last = handle->maps = NULL;
offset = swsusp_header->image;
while (offset) {
- tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
+ tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL);
if (!tmp) {
release_swap_reader(handle);
return -ENOMEM;
}
- memset(tmp, 0, sizeof(*tmp));
if (!handle->maps)
handle->maps = tmp;
if (last)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 2d8b60a3c86b..77438954cc2b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -1,16 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/power/user.c
*
* This file provides the user space interface for software suspend/resume.
*
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- *
*/
#include <linux/suspend.h>
-#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
@@ -228,9 +225,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
if (data->frozen)
break;
- printk("Syncing filesystems ... ");
- ksys_sync();
- printk("done.\n");
+ ksys_sync_helper();
error = freeze_processes();
if (error)
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4a2ffc39eb95..4d052fc6bcde 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 0f1898820cba..c8e6ab689d42 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -1,18 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* internal.h - printk internal definitions
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/percpu.h>
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index d3d170374ceb..1888f6a3b694 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/printk.c
*
@@ -65,6 +66,7 @@ int console_printk[4] = {
CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
+EXPORT_SYMBOL_GPL(console_printk);
atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);
@@ -85,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem);
struct console *console_drivers;
EXPORT_SYMBOL_GPL(console_drivers);
+/*
+ * System may need to suppress printk message under certain
+ * circumstances, like after kernel panic happens.
+ */
+int __read_mostly suppress_printk;
+
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
@@ -344,7 +352,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
enum log_flags {
LOG_NEWLINE = 2, /* text ended with a newline */
- LOG_PREFIX = 4, /* text started with a prefix */
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
@@ -356,6 +363,9 @@ struct printk_log {
u8 facility; /* syslog facility */
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
+#ifdef CONFIG_PRINTK_CALLER
+ u32 caller_id; /* thread id or processor id */
+#endif
}
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
__packed __aligned(4)
@@ -422,7 +432,11 @@ static u64 exclusive_console_stop_seq;
static u64 clear_seq;
static u32 clear_idx;
+#ifdef CONFIG_PRINTK_CALLER
+#define PREFIX_MAX 48
+#else
#define PREFIX_MAX 32
+#endif
#define LOG_LINE_MAX (1024 - PREFIX_MAX)
#define LOG_LEVEL(v) ((v) & 0x07)
@@ -577,7 +591,7 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
}
/* insert record into the buffer, discard old ones, update heads */
-static int log_store(int facility, int level,
+static int log_store(u32 caller_id, int facility, int level,
enum log_flags flags, u64 ts_nsec,
const char *dict, u16 dict_len,
const char *text, u16 text_len)
@@ -625,6 +639,9 @@ static int log_store(int facility, int level,
msg->ts_nsec = ts_nsec;
else
msg->ts_nsec = local_clock();
+#ifdef CONFIG_PRINTK_CALLER
+ msg->caller_id = caller_id;
+#endif
memset(log_dict(msg) + dict_len, 0, pad_len);
msg->len = size;
@@ -688,12 +705,21 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
struct printk_log *msg, u64 seq)
{
u64 ts_usec = msg->ts_nsec;
+ char caller[20];
+#ifdef CONFIG_PRINTK_CALLER
+ u32 id = msg->caller_id;
+
+ snprintf(caller, sizeof(caller), ",caller=%c%u",
+ id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
+#else
+ caller[0] = '\0';
+#endif
do_div(ts_usec, 1000);
- return scnprintf(buf, size, "%u,%llu,%llu,%c;",
- (msg->facility << 3) | msg->level, seq, ts_usec,
- msg->flags & LOG_CONT ? 'c' : '-');
+ return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
+ (msg->facility << 3) | msg->level, seq, ts_usec,
+ msg->flags & LOG_CONT ? 'c' : '-', caller);
}
static ssize_t msg_print_ext_body(char *buf, size_t size,
@@ -1038,6 +1064,9 @@ void log_buf_vmcoreinfo_setup(void)
VMCOREINFO_OFFSET(printk_log, len);
VMCOREINFO_OFFSET(printk_log, text_len);
VMCOREINFO_OFFSET(printk_log, dict_len);
+#ifdef CONFIG_PRINTK_CALLER
+ VMCOREINFO_OFFSET(printk_log, caller_id);
+#endif
}
#endif
@@ -1122,14 +1151,7 @@ void __init setup_log_buf(int early)
if (!new_log_buf_len)
return;
- if (early) {
- new_log_buf =
- memblock_alloc(new_log_buf_len, LOG_ALIGN);
- } else {
- new_log_buf = memblock_alloc_nopanic(new_log_buf_len,
- LOG_ALIGN);
- }
-
+ new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
if (unlikely(!new_log_buf)) {
pr_err("log_buf_len: %lu bytes not available\n",
new_log_buf_len);
@@ -1236,10 +1258,23 @@ static size_t print_time(u64 ts, char *buf)
{
unsigned long rem_nsec = do_div(ts, 1000000000);
- return sprintf(buf, "[%5lu.%06lu] ",
+ return sprintf(buf, "[%5lu.%06lu]",
(unsigned long)ts, rem_nsec / 1000);
}
+#ifdef CONFIG_PRINTK_CALLER
+static size_t print_caller(u32 id, char *buf)
+{
+ char caller[12];
+
+ snprintf(caller, sizeof(caller), "%c%u",
+ id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
+ return sprintf(buf, "[%6s]", caller);
+}
+#else
+#define print_caller(id, buf) 0
+#endif
+
static size_t print_prefix(const struct printk_log *msg, bool syslog,
bool time, char *buf)
{
@@ -1247,8 +1282,17 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog,
if (syslog)
len = print_syslog((msg->facility << 3) | msg->level, buf);
+
if (time)
len += print_time(msg->ts_nsec, buf + len);
+
+ len += print_caller(msg->caller_id, buf + len);
+
+ if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
+ buf[len++] = ' ';
+ buf[len] = '\0';
+ }
+
return len;
}
@@ -1752,6 +1796,12 @@ static inline void printk_delay(void)
}
}
+static inline u32 printk_caller_id(void)
+{
+ return in_task() ? task_pid_nr(current) :
+ 0x80000000 + raw_smp_processor_id();
+}
+
/*
* Continuation lines are buffered, and not committed to the record buffer
* until the line is complete, or a race forces it. The line fragments
@@ -1761,7 +1811,7 @@ static inline void printk_delay(void)
static struct cont {
char buf[LOG_LINE_MAX];
size_t len; /* length == 0 means unused buffer */
- struct task_struct *owner; /* task of first print*/
+ u32 caller_id; /* printk_caller_id() of first print */
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
u8 facility; /* log facility of first message */
@@ -1773,12 +1823,13 @@ static void cont_flush(void)
if (cont.len == 0)
return;
- log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec,
- NULL, 0, cont.buf, cont.len);
+ log_store(cont.caller_id, cont.facility, cont.level, cont.flags,
+ cont.ts_nsec, NULL, 0, cont.buf, cont.len);
cont.len = 0;
}
-static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len)
+static bool cont_add(u32 caller_id, int facility, int level,
+ enum log_flags flags, const char *text, size_t len)
{
/* If the line gets too long, split it up in separate records. */
if (cont.len + len > sizeof(cont.buf)) {
@@ -1789,7 +1840,7 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
if (!cont.len) {
cont.facility = facility;
cont.level = level;
- cont.owner = current;
+ cont.caller_id = caller_id;
cont.ts_nsec = local_clock();
cont.flags = flags;
}
@@ -1809,13 +1860,15 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char *
static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
{
+ const u32 caller_id = printk_caller_id();
+
/*
* If an earlier line was buffered, and we're a continuation
- * write from the same process, try to add it to the buffer.
+ * write from the same context, try to add it to the buffer.
*/
if (cont.len) {
- if (cont.owner == current && (lflags & LOG_CONT)) {
- if (cont_add(facility, level, lflags, text, text_len))
+ if (cont.caller_id == caller_id && (lflags & LOG_CONT)) {
+ if (cont_add(caller_id, facility, level, lflags, text, text_len))
return text_len;
}
/* Otherwise, make sure it's flushed */
@@ -1828,12 +1881,13 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c
/* If it doesn't end in a newline, try to buffer the current line */
if (!(lflags & LOG_NEWLINE)) {
- if (cont_add(facility, level, lflags, text, text_len))
+ if (cont_add(caller_id, facility, level, lflags, text, text_len))
return text_len;
}
/* Store it in the record log */
- return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len);
+ return log_store(caller_id, facility, level, lflags, 0,
+ dict, dictlen, text, text_len);
}
/* Must be called under logbuf_lock. */
@@ -1867,9 +1921,6 @@ int vprintk_store(int facility, int level,
case '0' ... '7':
if (level == LOGLEVEL_DEFAULT)
level = kern_level - '0';
- /* fallthrough */
- case 'd': /* KERN_DEFAULT */
- lflags |= LOG_PREFIX;
break;
case 'c': /* KERN_CONT */
lflags |= LOG_CONT;
@@ -1884,7 +1935,7 @@ int vprintk_store(int facility, int level,
level = default_message_loglevel;
if (dict)
- lflags |= LOG_PREFIX|LOG_NEWLINE;
+ lflags |= LOG_NEWLINE;
return log_output(facility, level, lflags,
dict, dictlen, text, text_len);
@@ -1899,6 +1950,10 @@ asmlinkage int vprintk_emit(int facility, int level,
unsigned long flags;
u64 curr_log_seq;
+ /* Suppress unimportant messages after panic happens */
+ if (unlikely(suppress_printk))
+ return 0;
+
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
in_sched = true;
@@ -2481,10 +2536,11 @@ void console_unblank(void)
/**
* console_flush_on_panic - flush console content on panic
+ * @mode: flush all messages in buffer or just the pending ones
*
* Immediately output all pending messages no matter what.
*/
-void console_flush_on_panic(void)
+void console_flush_on_panic(enum con_flush_mode mode)
{
/*
* If someone else is holding the console lock, trylock will fail
@@ -2495,6 +2551,15 @@ void console_flush_on_panic(void)
*/
console_trylock();
console_may_schedule = 0;
+
+ if (mode == CONSOLE_REPLAY_ALL) {
+ unsigned long flags;
+
+ logbuf_lock_irqsave(flags);
+ console_seq = log_first_seq;
+ console_idx = log_first_idx;
+ logbuf_unlock_irqrestore(flags);
+ }
console_unlock();
}
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 0913b4d385de..b4045e782743 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -1,18 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* printk_safe.c - Safe printk for printk-deadlock-prone contexts
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/preempt.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 9c08a2c7cb1d..af7c94bf5fa1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/profile.c
* Simple profiling. Manages a direct-mapped profile hit count buffer,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 771e93f9c43f..83a531cea2f3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/ptrace.c
*
@@ -29,6 +30,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
+#include <linux/sched/signal.h>
/*
* Access another process' address space via ptrace.
@@ -77,9 +79,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
*/
static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- rcu_read_lock();
- __ptrace_link(child, new_parent, __task_cred(new_parent));
- rcu_read_unlock();
+ __ptrace_link(child, new_parent, current_cred());
}
/**
@@ -116,6 +116,9 @@ void __ptrace_unlink(struct task_struct *child)
BUG_ON(!child->ptrace);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
@@ -322,6 +325,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return -EPERM;
ok:
rcu_read_unlock();
+ /*
+ * If a task drops privileges and becomes nondumpable (through a syscall
+ * like setresuid()) while we are trying to access it, we must ensure
+ * that the dumpability is read after the credentials; otherwise,
+ * we may be able to attach to a task that we shouldn't be able to
+ * attach to (as if the task had dropped privileges without becoming
+ * nondumpable).
+ * Pairs with a write barrier in commit_creds().
+ */
+ smp_rmb();
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
@@ -703,6 +716,10 @@ static int ptrace_peek_siginfo(struct task_struct *child,
if (arg.nr < 0)
return -EINVAL;
+ /* Ensure arg.off fits in an unsigned long */
+ if (arg.off > ULONG_MAX)
+ return 0;
+
if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
pending = &child->signal->shared_pending;
else
@@ -710,18 +727,20 @@ static int ptrace_peek_siginfo(struct task_struct *child,
for (i = 0; i < arg.nr; ) {
kernel_siginfo_t info;
- s32 off = arg.off + i;
+ unsigned long off = arg.off + i;
+ bool found = false;
spin_lock_irq(&child->sighand->siglock);
list_for_each_entry(q, &pending->list, list) {
if (!off--) {
+ found = true;
copy_siginfo(&info, &q->info);
break;
}
}
spin_unlock_irq(&child->sighand->siglock);
- if (off >= 0) /* beyond the end of the list */
+ if (!found) /* beyond the end of the list */
break;
#ifdef CONFIG_COMPAT
@@ -924,18 +943,26 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
- case PTRACE_GETSIGMASK:
+ case PTRACE_GETSIGMASK: {
+ sigset_t *mask;
+
if (addr != sizeof(sigset_t)) {
ret = -EINVAL;
break;
}
- if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ if (test_tsk_restore_sigmask(child))
+ mask = &child->saved_sigmask;
+ else
+ mask = &child->blocked;
+
+ if (copy_to_user(datavp, mask, sizeof(sigset_t)))
ret = -EFAULT;
else
ret = 0;
break;
+ }
case PTRACE_SETSIGMASK: {
sigset_t new_set;
@@ -961,6 +988,8 @@ int ptrace_request(struct task_struct *child, long request,
child->blocked = new_set;
spin_unlock_irq(&child->sighand->siglock);
+ clear_tsk_restore_sigmask(child);
+
ret = 0;
break;
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 939a2056c87a..480edf328b51 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# RCU-related configuration options
#
@@ -87,36 +88,6 @@ config RCU_STALL_COMMON
config RCU_NEED_SEGCBLIST
def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
-config CONTEXT_TRACKING
- bool
-
-config CONTEXT_TRACKING_FORCE
- bool "Force context tracking"
- depends on CONTEXT_TRACKING
- default y if !NO_HZ_FULL
- help
- The major pre-requirement for full dynticks to work is to
- support the context tracking subsystem. But there are also
- other dependencies to provide in order to make the full
- dynticks working.
-
- This option stands for testing when an arch implements the
- context tracking backend but doesn't yet fullfill all the
- requirements to make the full dynticks feature working.
- Without the full dynticks, there is no way to test the support
- for context tracking and the subsystems that rely on it: RCU
- userspace extended quiescent state and tickless cputime
- accounting. This option copes with the absence of the full
- dynticks subsystem by forcing the context tracking on all
- CPUs in the system.
-
- Say Y only if you're working on the development of an
- architecture backend for the context tracking.
-
- Say N otherwise, this option brings an overhead that you
- don't want in production.
-
-
config RCU_FANOUT
int "Tree-based hierarchical RCU fanout value"
range 2 64 if 64BIT
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 0ec7d1d33a14..5ec3ea4028e2 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# RCU-related debugging configuration options
#
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index a393e24a9195..5290b01de534 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -1,36 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update definitions shared among RCU implementations.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2011
*
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#ifndef __LINUX_RCU_H
#define __LINUX_RCU_H
#include <trace/events/rcu.h>
-#ifdef CONFIG_RCU_TRACE
-#define RCU_TRACE(stmt) stmt
-#else /* #ifdef CONFIG_RCU_TRACE */
-#define RCU_TRACE(stmt)
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */
+/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
@@ -229,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
rcu_lock_acquire(&rcu_callback_map);
if (__is_kfree_rcu_offset(offset)) {
- RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);)
+ trace_rcu_invoke_kfree_callback(rn, head, offset);
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
return true;
} else {
- RCU_TRACE(trace_rcu_invoke_callback(rn, head);)
+ trace_rcu_invoke_callback(rn, head);
f = head->func;
WRITE_ONCE(head->func, (rcu_callback_t)0L);
f(head);
@@ -246,6 +228,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_suppress;
+extern int rcu_cpu_stall_timeout;
int rcu_jiffies_till_stall_check(void);
#define rcu_ftrace_dump_stall_suppress() \
@@ -462,9 +445,8 @@ void rcu_request_urgent_qs_task(struct task_struct *t);
enum rcutorture_type {
RCU_FLAVOR,
- RCU_BH_FLAVOR,
- RCU_SCHED_FLAVOR,
RCU_TASKS_FLAVOR,
+ RCU_TRIVIAL_FLAVOR,
SRCU_FLAVOR,
INVALID_RCU_FLAVOR
};
@@ -498,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
#endif
#endif
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+#endif
+
#ifdef CONFIG_TINY_SRCU
static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 5aff271adf1e..9bd5f6023c21 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* RCU segmented callback lists, function definitions
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2017
*
- * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/types.h>
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 948470cef385..71b64648464e 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -1,23 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* RCU segmented callback lists, internal-to-rcu header file
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2017
*
- * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/rcu_segcblist.h>
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index b459da70b4fc..7a6890b23c5f 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update module-based performance-test facility
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2015
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#define pr_fmt(fmt) fmt
@@ -54,7 +41,7 @@
#include "rcu.h"
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
#define PERF_FLAG "-perf:"
#define PERFOUT_STRING(s) \
@@ -83,13 +70,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
* Various other use cases may of course be specified.
*/
+#ifdef MODULE
+# define RCUPERF_SHUTDOWN 0
+#else
+# define RCUPERF_SHUTDOWN 1
+#endif
+
torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
torture_param(int, nreaders, -1, "Number of RCU reader threads");
torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, !IS_ENABLED(MODULE),
+torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
"Shutdown at end of performance tests.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
@@ -501,6 +494,10 @@ rcu_perf_cleanup(void)
if (torture_cleanup_begin())
return;
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
@@ -621,6 +618,7 @@ rcu_perf_init(void)
pr_cont("\n");
WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST));
firsterr = -EINVAL;
+ cur_ops = NULL;
goto unwind;
}
if (cur_ops->init)
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f6e85faa4ff4..fce4e7e6f502 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update module-based torture test facility
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2005, 2006
*
- * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
* Josh Triplett <josh@joshtriplett.org>
*
* See also: Documentation/RCU/torture.txt
@@ -61,7 +48,7 @@
#include "rcu.h"
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
/* Bits for ->extendables field, extendables param, and related definitions. */
@@ -312,7 +299,7 @@ struct rcu_torture_ops {
int irq_capable;
int can_boost;
int extendables;
- int ext_irq_conflict;
+ int slow_gps;
const char *name;
};
@@ -605,12 +592,7 @@ static void srcu_torture_init(void)
static void srcu_torture_cleanup(void)
{
- static DEFINE_TORTURE_RANDOM(rand);
-
- if (torture_random(&rand) & 0x800)
- cleanup_srcu_struct(&srcu_ctld);
- else
- cleanup_srcu_struct_quiesced(&srcu_ctld);
+ cleanup_srcu_struct(&srcu_ctld);
srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */
}
@@ -686,9 +668,51 @@ static struct rcu_torture_ops tasks_ops = {
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
+ .slow_gps = 1,
.name = "tasks"
};
+/*
+ * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
+ * This implementation does not necessarily work well with CPU hotplug.
+ */
+
+static void synchronize_rcu_trivial(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
+ WARN_ON_ONCE(raw_smp_processor_id() != cpu);
+ }
+}
+
+static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
+{
+ preempt_enable();
+}
+
+static struct rcu_torture_ops trivial_ops = {
+ .ttype = RCU_TRIVIAL_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock_trivial,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock_trivial,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_trivial,
+ .exp_sync = synchronize_rcu_trivial,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "trivial"
+};
+
static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -1029,10 +1053,17 @@ rcu_torture_writer(void *arg)
!rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
- if (stutter_wait("rcu_torture_writer"))
+ if (stutter_wait("rcu_torture_writer") &&
+ !READ_ONCE(rcu_fwd_cb_nodelay) &&
+ !cur_ops->slow_gps &&
+ !torture_must_stop())
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
- if (list_empty(&rcu_tortures[i].rtort_free))
- WARN_ON_ONCE(1);
+ if (list_empty(&rcu_tortures[i].rtort_free) &&
+ rcu_access_pointer(rcu_torture_current) !=
+ &rcu_tortures[i]) {
+ rcu_ftrace_dump(DUMP_ALL);
+ WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
+ }
} while (!torture_must_stop());
/* Reset expediting back to unexpedited. */
if (expediting > 0)
@@ -1173,7 +1204,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
unsigned long randmask2 = randmask1 >> 3;
WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT);
- /* Most of the time lots of bits, half the time only one bit. */
+ /* Mostly only one bit (need preemption!), sometimes lots of bits. */
if (!(randmask1 & 0x7))
mask = mask & randmask2;
else
@@ -1183,10 +1214,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||
(!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))
mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH;
- if ((mask & RCUTORTURE_RDR_IRQ) &&
- !(mask & cur_ops->ext_irq_conflict) &&
- (oldmask & cur_ops->ext_irq_conflict))
- mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */
return mask ?: RCUTORTURE_RDR_RCU;
}
@@ -1381,8 +1408,9 @@ rcu_torture_stats_print(void)
}
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
- pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
+ pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
rcu_torture_current,
+ rcu_torture_current ? "ver" : "VER",
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
atomic_read(&n_rcu_torture_alloc),
@@ -1630,21 +1658,34 @@ static bool rcu_fwd_emergency_stop;
#define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */
#define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */
#define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */
-static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)];
+struct rcu_launder_hist {
+ long n_launders;
+ unsigned long launder_gp_seq;
+};
+#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV))
+static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST];
+static unsigned long rcu_launder_gp_seq_start;
static void rcu_torture_fwd_cb_hist(void)
{
+ unsigned long gps;
+ unsigned long gps_old;
int i;
int j;
for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--)
- if (n_launders_hist[i] > 0)
+ if (n_launders_hist[i].n_launders > 0)
break;
pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):",
__func__, jiffies - rcu_fwd_startat);
- for (j = 0; j <= i; j++)
- pr_cont(" %ds/%d: %ld",
- j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]);
+ gps_old = rcu_launder_gp_seq_start;
+ for (j = 0; j <= i; j++) {
+ gps = n_launders_hist[j].launder_gp_seq;
+ pr_cont(" %ds/%d: %ld:%ld",
+ j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders,
+ rcutorture_seq_diff(gps, gps_old));
+ gps_old = gps;
+ }
pr_cont("\n");
}
@@ -1666,10 +1707,22 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
if (i >= ARRAY_SIZE(n_launders_hist))
i = ARRAY_SIZE(n_launders_hist) - 1;
- n_launders_hist[i]++;
+ n_launders_hist[i].n_launders++;
+ n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq();
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
}
+// Give the scheduler a chance, even on nohz_full CPUs.
+static void rcu_torture_fwd_prog_cond_resched(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ if (need_resched())
+ schedule();
+ } else {
+ cond_resched();
+ }
+}
+
/*
* Free all callbacks on the rcu_fwd_cb_head list, either because the
* test is over or because we hit an OOM event.
@@ -1683,16 +1736,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
for (;;) {
spin_lock_irqsave(&rcu_fwd_lock, flags);
rfcp = rcu_fwd_cb_head;
- if (!rfcp)
+ if (!rfcp) {
+ spin_unlock_irqrestore(&rcu_fwd_lock, flags);
break;
+ }
rcu_fwd_cb_head = rfcp->rfc_next;
if (!rcu_fwd_cb_head)
rcu_fwd_cb_tail = &rcu_fwd_cb_head;
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
kfree(rfcp);
freed++;
+ rcu_torture_fwd_prog_cond_resched();
}
- spin_unlock_irqrestore(&rcu_fwd_lock, flags);
return freed;
}
@@ -1716,6 +1771,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
}
/* Tight loop containing cond_resched(). */
+ WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ cur_ops->sync(); /* Later readers see above write. */
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 0);
cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
@@ -1733,7 +1790,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
udelay(10);
cur_ops->readunlock(idx);
if (!fwd_progress_need_resched || need_resched())
- cond_resched();
+ rcu_torture_fwd_prog_cond_resched();
}
(*tested_tries)++;
if (!time_before(jiffies, stopat) &&
@@ -1754,6 +1811,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
WARN_ON(READ_ONCE(fcs.stop) != 2);
destroy_rcu_head_on_stack(&fcs.rh);
}
+ schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
+ WRITE_ONCE(rcu_fwd_cb_nodelay, false);
}
/* Carry out call_rcu() forward-progress testing. */
@@ -1774,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void)
if (READ_ONCE(rcu_fwd_emergency_stop))
return; /* Get out of the way quickly, no GP wait! */
+ if (!cur_ops->call)
+ return; /* Can't do call_rcu() fwd prog without ->call. */
/* Loop continuously posting RCU callbacks. */
WRITE_ONCE(rcu_fwd_cb_nodelay, true);
@@ -1786,9 +1847,10 @@ static void rcu_torture_fwd_prog_cr(void)
n_max_cbs = 0;
n_max_gps = 0;
for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++)
- n_launders_hist[i] = 0;
+ n_launders_hist[i].n_launders = 0;
cver = READ_ONCE(rcu_torture_current_version);
gps = cur_ops->get_gp_seq();
+ rcu_launder_gp_seq_start = gps;
while (time_before(jiffies, stopat) &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
rfcp = READ_ONCE(rcu_fwd_cb_head);
@@ -1813,7 +1875,7 @@ static void rcu_torture_fwd_prog_cr(void)
rfcp->rfc_gps = 0;
}
cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
- cond_resched();
+ rcu_torture_fwd_prog_cond_resched();
}
stoppedat = jiffies;
n_launders_cb_snap = READ_ONCE(n_launders_cb);
@@ -1822,7 +1884,6 @@ static void rcu_torture_fwd_prog_cr(void)
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
(void)rcu_torture_fwd_prog_cbfree();
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) {
WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
@@ -1833,6 +1894,8 @@ static void rcu_torture_fwd_prog_cr(void)
n_max_gps, n_max_cbs, cver, gps);
rcu_torture_fwd_cb_hist();
}
+ schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
+ WRITE_ONCE(rcu_fwd_cb_nodelay, false);
}
@@ -1846,7 +1909,7 @@ static int rcutorture_oom_notify(struct notifier_block *self,
WARN(1, "%s invoked upon OOM during forward-progress testing.\n",
__func__);
rcu_torture_fwd_cb_hist();
- rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2));
+ rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);
WRITE_ONCE(rcu_fwd_emergency_stop, true);
smp_mb(); /* Emergency stop before free and wait to avoid hangs. */
pr_info("%s: Freed %lu RCU callbacks.\n",
@@ -2092,6 +2155,10 @@ rcu_torture_cleanup(void)
cur_ops->cb_barrier();
return;
}
+ if (!cur_ops) {
+ torture_cleanup_end();
+ return;
+ }
rcu_torture_barrier_cleanup();
torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
@@ -2228,6 +2295,14 @@ static void rcu_test_debug_objects(void)
#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
}
+static void rcutorture_sync(void)
+{
+ static unsigned long n;
+
+ if (cur_ops->sync && !(++n & 0xfff))
+ cur_ops->sync();
+}
+
static int __init
rcu_torture_init(void)
{
@@ -2236,7 +2311,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &busted_srcud_ops, &tasks_ops,
+ &busted_srcud_ops, &tasks_ops, &trivial_ops,
};
if (!torture_init_begin(torture_type, verbose))
@@ -2257,6 +2332,7 @@ rcu_torture_init(void)
pr_cont("\n");
WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
firsterr = -EINVAL;
+ cur_ops = NULL;
goto unwind;
}
if (cur_ops->fqs == NULL && fqs_duration != 0) {
@@ -2358,7 +2434,10 @@ rcu_torture_init(void)
if (stutter < 0)
stutter = 0;
if (stutter) {
- firsterr = torture_stutter_init(stutter * HZ);
+ int t;
+
+ t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
+ firsterr = torture_stutter_init(stutter * HZ, t);
if (firsterr)
goto unwind;
}
@@ -2389,7 +2468,8 @@ rcu_torture_init(void)
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
if (firsterr)
goto unwind;
- firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval);
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval,
+ rcutorture_sync);
if (firsterr)
goto unwind;
firsterr = rcu_torture_stall_init();
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 32dfd6522548..44d6606b8325 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -1,24 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Sleepable Read-Copy Update mechanism for mutual exclusion,
* tiny version for non-preemptible single-CPU use.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2017
*
- * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Author: Paul McKenney <paulmck@linux.ibm.com>
*/
#include <linux/export.h>
@@ -89,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
* Must invoke this after you are finished using a given srcu_struct that
* was initialized via init_srcu_struct(), else you leak memory.
*/
-void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
+void cleanup_srcu_struct(struct srcu_struct *ssp)
{
WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]);
- if (quiesced)
- WARN_ON(work_pending(&ssp->srcu_work));
- else
- flush_work(&ssp->srcu_work);
+ flush_work(&ssp->srcu_work);
WARN_ON(ssp->srcu_gp_running);
WARN_ON(ssp->srcu_gp_waiting);
WARN_ON(ssp->srcu_cb_head);
WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
}
-EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
/*
* Removes the count for the old reader from the appropriate element of
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 3600d88d8956..cf0e886314f2 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1,24 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Sleepable Read-Copy Update mechanism for mutual exclusion.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2006
* Copyright (C) Fujitsu, 2012
*
- * Author: Paul McKenney <paulmck@us.ibm.com>
+ * Author: Paul McKenney <paulmck@linux.ibm.com>
* Lai Jiangshan <laijs@cn.fujitsu.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
@@ -58,6 +45,7 @@ static bool __read_mostly srcu_init_done;
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
static void process_srcu(struct work_struct *work);
+static void srcu_delay_timer(struct timer_list *t);
/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
#define spin_lock_rcu_node(p) \
@@ -156,7 +144,8 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static)
snp->grphi = cpu;
}
sdp->cpu = cpu;
- INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
+ INIT_WORK(&sdp->work, srcu_invoke_callbacks);
+ timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
sdp->ssp = ssp;
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
if (is_static)
@@ -371,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
return SRCU_INTERVAL;
}
-/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */
-void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
+/**
+ * cleanup_srcu_struct - deconstruct a sleep-RCU structure
+ * @ssp: structure to clean up.
+ *
+ * Must invoke this after you are finished using a given srcu_struct that
+ * was initialized via init_srcu_struct(), else you leak memory.
+ */
+void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
@@ -380,19 +375,15 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
- if (quiesced) {
- if (WARN_ON(delayed_work_pending(&ssp->work)))
- return; /* Just leak it! */
- } else {
- flush_delayed_work(&ssp->work);
+ flush_delayed_work(&ssp->work);
+ for_each_possible_cpu(cpu) {
+ struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
+
+ del_timer_sync(&sdp->delay_work);
+ flush_work(&sdp->work);
+ if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
+ return; /* Forgot srcu_barrier(), so just leak it! */
}
- for_each_possible_cpu(cpu)
- if (quiesced) {
- if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work)))
- return; /* Just leak it! */
- } else {
- flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work);
- }
if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
WARN_ON(srcu_readers_active(ssp))) {
pr_info("%s: Active srcu_struct %p state: %d\n",
@@ -402,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)
free_percpu(ssp->sda);
ssp->sda = NULL;
}
-EXPORT_SYMBOL_GPL(_cleanup_srcu_struct);
+EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
/*
* Counts the new reader in the appropriate per-CPU element of the
@@ -463,39 +454,23 @@ static void srcu_gp_start(struct srcu_struct *ssp)
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
-/*
- * Track online CPUs to guide callback workqueue placement.
- */
-DEFINE_PER_CPU(bool, srcu_online);
-void srcu_online_cpu(unsigned int cpu)
+static void srcu_delay_timer(struct timer_list *t)
{
- WRITE_ONCE(per_cpu(srcu_online, cpu), true);
-}
+ struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work);
-void srcu_offline_cpu(unsigned int cpu)
-{
- WRITE_ONCE(per_cpu(srcu_online, cpu), false);
+ queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
}
-/*
- * Place the workqueue handler on the specified CPU if online, otherwise
- * just run it whereever. This is useful for placing workqueue handlers
- * that are to invoke the specified CPU's callbacks.
- */
-static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
- struct delayed_work *dwork,
+static void srcu_queue_delayed_work_on(struct srcu_data *sdp,
unsigned long delay)
{
- bool ret;
+ if (!delay) {
+ queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
+ return;
+ }
- preempt_disable();
- if (READ_ONCE(per_cpu(srcu_online, cpu)))
- ret = queue_delayed_work_on(cpu, wq, dwork, delay);
- else
- ret = queue_delayed_work(wq, dwork, delay);
- preempt_enable();
- return ret;
+ timer_reduce(&sdp->delay_work, jiffies + delay);
}
/*
@@ -504,7 +479,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
*/
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
- srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay);
+ srcu_queue_delayed_work_on(sdp, delay);
}
/*
@@ -856,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp)
* srcu_read_lock(), and srcu_read_unlock() that are all passed the same
* srcu_struct structure.
*/
-void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
- rcu_callback_t func, bool do_norm)
+static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func, bool do_norm)
{
unsigned long flags;
int idx;
@@ -1186,7 +1161,8 @@ static void srcu_invoke_callbacks(struct work_struct *work)
struct srcu_data *sdp;
struct srcu_struct *ssp;
- sdp = container_of(work, struct srcu_data, work.work);
+ sdp = container_of(work, struct srcu_data, work);
+
ssp = sdp->ssp;
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
@@ -1334,3 +1310,68 @@ void __init srcu_init(void)
queue_work(rcu_gp_wq, &ssp->work.work);
}
}
+
+#ifdef CONFIG_MODULES
+
+/* Initialize any global-scope srcu_struct structures used by this module. */
+static int srcu_module_coming(struct module *mod)
+{
+ int i;
+ struct srcu_struct **sspp = mod->srcu_struct_ptrs;
+ int ret;
+
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ret = init_srcu_struct(*(sspp++));
+ if (WARN_ON_ONCE(ret))
+ return ret;
+ }
+ return 0;
+}
+
+/* Clean up any global-scope srcu_struct structures used by this module. */
+static void srcu_module_going(struct module *mod)
+{
+ int i;
+ struct srcu_struct **sspp = mod->srcu_struct_ptrs;
+
+ for (i = 0; i < mod->num_srcu_structs; i++)
+ cleanup_srcu_struct(*(sspp++));
+}
+
+/* Handle one module, either coming or going. */
+static int srcu_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ int ret = 0;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ret = srcu_module_coming(mod);
+ break;
+ case MODULE_STATE_GOING:
+ srcu_module_going(mod);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block srcu_module_nb = {
+ .notifier_call = srcu_module_notify,
+ .priority = 0,
+};
+
+static __init int init_srcu_module_notifier(void)
+{
+ int ret;
+
+ ret = register_module_notifier(&srcu_module_nb);
+ if (ret)
+ pr_warn("Failed to register srcu module notifier\n");
+ return ret;
+}
+late_initcall(init_srcu_module_notifier);
+
+#endif /* #ifdef CONFIG_MODULES */
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be10036fa621..d4558ab7a07d 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* RCU-based infrastructure for lightweight reader-writer locking
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (c) 2015, Red Hat, Inc.
*
* Author: Oleg Nesterov <oleg@redhat.com>
@@ -23,65 +10,18 @@
#include <linux/rcu_sync.h>
#include <linux/sched.h>
-#ifdef CONFIG_PROVE_RCU
-#define __INIT_HELD(func) .held = func,
-#else
-#define __INIT_HELD(func)
-#endif
-
-static const struct {
- void (*sync)(void);
- void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
- void (*wait)(void);
-#ifdef CONFIG_PROVE_RCU
- int (*held)(void);
-#endif
-} gp_ops[] = {
- [RCU_SYNC] = {
- .sync = synchronize_rcu,
- .call = call_rcu,
- .wait = rcu_barrier,
- __INIT_HELD(rcu_read_lock_held)
- },
- [RCU_SCHED_SYNC] = {
- .sync = synchronize_rcu,
- .call = call_rcu,
- .wait = rcu_barrier,
- __INIT_HELD(rcu_read_lock_sched_held)
- },
- [RCU_BH_SYNC] = {
- .sync = synchronize_rcu,
- .call = call_rcu,
- .wait = rcu_barrier,
- __INIT_HELD(rcu_read_lock_bh_held)
- },
-};
-
-enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
-enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
#define rss_lock gp_wait.lock
-#ifdef CONFIG_PROVE_RCU
-void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
-{
- RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
- "suspicious rcu_sync_is_idle() usage");
-}
-
-EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
-#endif
-
/**
* rcu_sync_init() - Initialize an rcu_sync structure
* @rsp: Pointer to rcu_sync structure to be initialized
- * @type: Flavor of RCU with which to synchronize rcu_sync structure
*/
-void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
+void rcu_sync_init(struct rcu_sync *rsp)
{
memset(rsp, 0, sizeof(*rsp));
init_waitqueue_head(&rsp->gp_wait);
- rsp->gp_type = type;
}
/**
@@ -99,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp)
rsp->gp_state = GP_PASSED;
}
-/**
- * rcu_sync_enter() - Force readers onto slowpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
- *
- * This function is used by updaters who need readers to make use of
- * a slowpath during the update. After this function returns, all
- * subsequent calls to rcu_sync_is_idle() will return false, which
- * tells readers to stay off their fastpaths. A later call to
- * rcu_sync_exit() re-enables reader slowpaths.
- *
- * When called in isolation, rcu_sync_enter() must wait for a grace
- * period, however, closely spaced calls to rcu_sync_enter() can
- * optimize away the grace-period wait via a state machine implemented
- * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
- */
-void rcu_sync_enter(struct rcu_sync *rsp)
-{
- bool need_wait, need_sync;
- spin_lock_irq(&rsp->rss_lock);
- need_wait = rsp->gp_count++;
- need_sync = rsp->gp_state == GP_IDLE;
- if (need_sync)
- rsp->gp_state = GP_PENDING;
- spin_unlock_irq(&rsp->rss_lock);
+static void rcu_sync_func(struct rcu_head *rhp);
- WARN_ON_ONCE(need_wait && need_sync);
- if (need_sync) {
- gp_ops[rsp->gp_type].sync();
- rsp->gp_state = GP_PASSED;
- wake_up_all(&rsp->gp_wait);
- } else if (need_wait) {
- wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
- } else {
- /*
- * Possible when there's a pending CB from a rcu_sync_exit().
- * Nobody has yet been allowed the 'fast' path and thus we can
- * avoid doing any sync(). The callback will get 'dropped'.
- */
- WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
- }
+static void rcu_sync_call(struct rcu_sync *rsp)
+{
+ call_rcu(&rsp->cb_head, rcu_sync_func);
}
/**
* rcu_sync_func() - Callback function managing reader access to fastpath
* @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
*
- * This function is passed to one of the call_rcu() functions by
+ * This function is passed to call_rcu() function by rcu_sync_enter() and
* rcu_sync_exit(), so that it is invoked after a grace period following the
- * that invocation of rcu_sync_exit(). It takes action based on events that
+ * that invocation of enter/exit.
+ *
+ * If it is called by rcu_sync_enter() it signals that all the readers were
+ * switched onto slow path.
+ *
+ * If it is called by rcu_sync_exit() it takes action based on events that
* have taken place in the meantime, so that closely spaced rcu_sync_enter()
* and rcu_sync_exit() pairs need not wait for a grace period.
*
@@ -165,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp)
struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
unsigned long flags;
- WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
- WARN_ON_ONCE(rsp->cb_state == CB_IDLE);
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
spin_lock_irqsave(&rsp->rss_lock, flags);
if (rsp->gp_count) {
/*
- * A new rcu_sync_begin() has happened; drop the callback.
+ * We're at least a GP after the GP_IDLE->GP_ENTER transition.
*/
- rsp->cb_state = CB_IDLE;
- } else if (rsp->cb_state == CB_REPLAY) {
+ WRITE_ONCE(rsp->gp_state, GP_PASSED);
+ wake_up_locked(&rsp->gp_wait);
+ } else if (rsp->gp_state == GP_REPLAY) {
/*
- * A new rcu_sync_exit() has happened; requeue the callback
- * to catch a later GP.
+ * A new rcu_sync_exit() has happened; requeue the callback to
+ * catch a later GP.
*/
- rsp->cb_state = CB_PENDING;
- gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
+ WRITE_ONCE(rsp->gp_state, GP_EXIT);
+ rcu_sync_call(rsp);
} else {
/*
- * We're at least a GP after rcu_sync_exit(); eveybody will now
- * have observed the write side critical section. Let 'em rip!.
+ * We're at least a GP after the last rcu_sync_exit(); eveybody
+ * will now have observed the write side critical section.
+ * Let 'em rip!.
*/
- rsp->cb_state = CB_IDLE;
- rsp->gp_state = GP_IDLE;
+ WRITE_ONCE(rsp->gp_state, GP_IDLE);
}
spin_unlock_irqrestore(&rsp->rss_lock, flags);
}
/**
- * rcu_sync_exit() - Allow readers back onto fast patch after grace period
+ * rcu_sync_enter() - Force readers onto slowpath
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who need readers to make use of
+ * a slowpath during the update. After this function returns, all
+ * subsequent calls to rcu_sync_is_idle() will return false, which
+ * tells readers to stay off their fastpaths. A later call to
+ * rcu_sync_exit() re-enables reader slowpaths.
+ *
+ * When called in isolation, rcu_sync_enter() must wait for a grace
+ * period, however, closely spaced calls to rcu_sync_enter() can
+ * optimize away the grace-period wait via a state machine implemented
+ * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
+ */
+void rcu_sync_enter(struct rcu_sync *rsp)
+{
+ int gp_state;
+
+ spin_lock_irq(&rsp->rss_lock);
+ gp_state = rsp->gp_state;
+ if (gp_state == GP_IDLE) {
+ WRITE_ONCE(rsp->gp_state, GP_ENTER);
+ WARN_ON_ONCE(rsp->gp_count);
+ /*
+ * Note that we could simply do rcu_sync_call(rsp) here and
+ * avoid the "if (gp_state == GP_IDLE)" block below.
+ *
+ * However, synchronize_rcu() can be faster if rcu_expedited
+ * or rcu_blocking_is_gp() is true.
+ *
+ * Another reason is that we can't wait for rcu callback if
+ * we are called at early boot time but this shouldn't happen.
+ */
+ }
+ rsp->gp_count++;
+ spin_unlock_irq(&rsp->rss_lock);
+
+ if (gp_state == GP_IDLE) {
+ /*
+ * See the comment above, this simply does the "synchronous"
+ * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED.
+ */
+ synchronize_rcu();
+ rcu_sync_func(&rsp->cb_head);
+ /* Not really needed, wait_event() would see GP_PASSED. */
+ return;
+ }
+
+ wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED);
+}
+
+/**
+ * rcu_sync_exit() - Allow readers back onto fast path after grace period
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
* This function is used by updaters who have completed, and can therefore
@@ -204,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp)
*/
void rcu_sync_exit(struct rcu_sync *rsp)
{
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
+
spin_lock_irq(&rsp->rss_lock);
if (!--rsp->gp_count) {
- if (rsp->cb_state == CB_IDLE) {
- rsp->cb_state = CB_PENDING;
- gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
- } else if (rsp->cb_state == CB_PENDING) {
- rsp->cb_state = CB_REPLAY;
+ if (rsp->gp_state == GP_PASSED) {
+ WRITE_ONCE(rsp->gp_state, GP_EXIT);
+ rcu_sync_call(rsp);
+ } else if (rsp->gp_state == GP_EXIT) {
+ WRITE_ONCE(rsp->gp_state, GP_REPLAY);
}
}
spin_unlock_irq(&rsp->rss_lock);
@@ -222,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp)
*/
void rcu_sync_dtor(struct rcu_sync *rsp)
{
- int cb_state;
+ int gp_state;
- WARN_ON_ONCE(rsp->gp_count);
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_count));
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
spin_lock_irq(&rsp->rss_lock);
- if (rsp->cb_state == CB_REPLAY)
- rsp->cb_state = CB_PENDING;
- cb_state = rsp->cb_state;
+ if (rsp->gp_state == GP_REPLAY)
+ WRITE_ONCE(rsp->gp_state, GP_EXIT);
+ gp_state = rsp->gp_state;
spin_unlock_irq(&rsp->rss_lock);
- if (cb_state != CB_IDLE) {
- gp_ops[rsp->gp_type].wait();
- WARN_ON_ONCE(rsp->cb_state != CB_IDLE);
+ if (gp_state != GP_IDLE) {
+ rcu_barrier();
+ WARN_ON_ONCE(rsp->gp_state != GP_IDLE);
}
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 5f5963ba313e..477b4eb44af5 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
* Documentation/RCU
@@ -65,7 +52,7 @@ void rcu_qs(void)
local_irq_save(flags);
if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {
rcu_ctrlblk.donetail = rcu_ctrlblk.curtail;
- raise_softirq(RCU_SOFTIRQ);
+ raise_softirq_irqoff(RCU_SOFTIRQ);
}
local_irq_restore(flags);
}
@@ -76,7 +63,7 @@ void rcu_qs(void)
* be called from hardirq context. It is normally called from the
* scheduling-clock interrupt.
*/
-void rcu_check_callbacks(int user)
+void rcu_sched_clock_irq(int user)
{
if (user) {
rcu_qs();
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9180158756d2..a14e5fbbea46 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1,27 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version
*
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
*
* For detailed explanation of Read-Copy Update mechanism see -
@@ -62,6 +49,14 @@
#include <linux/suspend.h>
#include <linux/ftrace.h>
#include <linux/tick.h>
+#include <linux/sysrq.h>
+#include <linux/kprobes.h>
+#include <linux/gfp.h>
+#include <linux/oom.h>
+#include <linux/smpboot.h>
+#include <linux/jiffies.h>
+#include <linux/sched/isolation.h>
+#include "../time/tick-internal.h"
#include "tree.h"
#include "rcu.h"
@@ -103,6 +98,9 @@ struct rcu_state rcu_state = {
/* Dump rcu_node combining tree at boot to verify correct setup. */
static bool dump_tree;
module_param(dump_tree, bool, 0444);
+/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
+static bool use_softirq = 1;
+module_param(use_softirq, bool, 0444);
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
@@ -113,8 +111,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
-/* panic() on RCU Stall sysctl. */
-int sysctl_panic_on_rcu_stall __read_mostly;
/*
* The rcu_scheduler_active variable is initialized to the value
@@ -151,13 +147,12 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
-static void invoke_rcu_callbacks(struct rcu_data *rdp);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
/* rcuc/rcub kthread realtime priority */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
-module_param(kthread_prio, int, 0644);
+module_param(kthread_prio, int, 0444);
/* Delay in jiffies for grace-period initialization delays, debug only. */
@@ -381,19 +376,33 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void)
}
/**
- * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
+ * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
*
- * If the current CPU is idle or running at a first-level (not nested)
+ * If the current CPU is idle and running at a first-level (not nested)
* interrupt from idle, return true. The caller must have at least
* disabled preemption.
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
- return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 &&
- __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1;
+ /* Called only from within the scheduling-clock interrupt */
+ lockdep_assert_in_irq();
+
+ /* Check for counter underflows */
+ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
+ "RCU dynticks_nesting counter underflow!");
+ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
+ "RCU dynticks_nmi_nesting counter underflow/zero!");
+
+ /* Are we at first interrupt nesting level? */
+ if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
+ return false;
+
+ /* Does CPU appear to be idle from an RCU standpoint? */
+ return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
}
-#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
+#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */
+#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */
static long blimit = DEFAULT_RCU_BLIMIT;
#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
static long qhimark = DEFAULT_RCU_QHIMARK;
@@ -414,7 +423,7 @@ static bool rcu_kick_kthreads;
*/
static ulong jiffies_till_sched_qs = ULONG_MAX;
module_param(jiffies_till_sched_qs, ulong, 0444);
-static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */
+static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */
module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */
/*
@@ -432,6 +441,7 @@ static void adjust_jiffies_till_sched_qs(void)
WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
return;
}
+ /* Otherwise, set to third fqs scan, but bound below on large system. */
j = READ_ONCE(jiffies_till_first_fqs) +
2 * READ_ONCE(jiffies_till_next_fqs);
if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
@@ -479,7 +489,6 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next
module_param(rcu_kick_kthreads, bool, 0644);
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
-static void force_quiescent_state(void);
static int rcu_pending(void);
/*
@@ -504,13 +513,12 @@ unsigned long rcu_exp_batches_completed(void)
EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
/*
- * Force a quiescent state.
+ * Return the root node of the rcu_state structure.
*/
-void rcu_force_quiescent_state(void)
+static struct rcu_node *rcu_get_root(void)
{
- force_quiescent_state();
+ return &rcu_state.node[0];
}
-EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
* Convert a ->gp_state value to a character string.
@@ -523,42 +531,6 @@ static const char *gp_state_getname(short gs)
}
/*
- * Show the state of the grace-period kthreads.
- */
-void show_rcu_gp_kthreads(void)
-{
- int cpu;
- unsigned long j;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- j = jiffies - READ_ONCE(rcu_state.gp_activity);
- pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n",
- rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, rcu_state.gp_kthread->state, j);
- rcu_for_each_node_breadth_first(rnp) {
- if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
- continue;
- pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n",
- rnp->grplo, rnp->grphi, rnp->gp_seq,
- rnp->gp_seq_needed);
- if (!rcu_is_leaf_node(rnp))
- continue;
- for_each_leaf_node_possible_cpu(rnp, cpu) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->gpwrap ||
- ULONG_CMP_GE(rcu_state.gp_seq,
- rdp->gp_seq_needed))
- continue;
- pr_info("\tcpu %d ->gp_seq_needed %lu\n",
- cpu, rdp->gp_seq_needed);
- }
- }
- /* sched_show_task(rcu_state.gp_kthread); */
-}
-EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
-
-/*
* Send along grace-period-related data for rcutorture diagnostics.
*/
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
@@ -566,8 +538,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
{
switch (test_type) {
case RCU_FLAVOR:
- case RCU_BH_FLAVOR:
- case RCU_SCHED_FLAVOR:
*flags = READ_ONCE(rcu_state.gp_flags);
*gp_seq = rcu_seq_current(&rcu_state.gp_seq);
break;
@@ -578,14 +548,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
/*
- * Return the root node of the rcu_state structure.
- */
-static struct rcu_node *rcu_get_root(void)
-{
- return &rcu_state.node[0];
-}
-
-/*
* Enter an RCU extended quiescent state, which can be either the
* idle loop or adaptive-tickless usermode execution.
*
@@ -701,7 +663,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
/**
* rcu_nmi_exit - inform RCU of exit from NMI context
- * @irq: Is this call from rcu_irq_exit?
*
* If you add or remove a call to rcu_nmi_exit(), be sure to test
* with CONFIG_RCU_EQS_DEBUG=y.
@@ -872,6 +833,7 @@ void rcu_nmi_enter(void)
{
rcu_nmi_enter_common(false);
}
+NOKPROBE_SYMBOL(rcu_nmi_enter);
/**
* rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -1022,27 +984,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
}
/*
- * Handler for the irq_work request posted when a grace period has
- * gone on for too long, but not yet long enough for an RCU CPU
- * stall warning. Set state appropriately, but just complain if
- * there is unexpected state on entry.
- */
-static void rcu_iw_handler(struct irq_work *iwp)
-{
- struct rcu_data *rdp;
- struct rcu_node *rnp;
-
- rdp = container_of(iwp, struct rcu_data, rcu_iw);
- rnp = rdp->mynode;
- raw_spin_lock_rcu_node(rnp);
- if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
- rdp->rcu_iw_gp_seq = rnp->gp_seq;
- rdp->rcu_iw_pending = false;
- }
- raw_spin_unlock_rcu_node(rnp);
-}
-
-/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1115,7 +1056,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
}
/*
- * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks!
+ * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
* The above code handles this, but only for straight cond_resched().
* And some in-kernel loops check need_resched() before calling
* cond_resched(), which defeats the above code for CPUs that are
@@ -1155,295 +1096,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
return 0;
}
-static void record_gp_stall_check_time(void)
-{
- unsigned long j = jiffies;
- unsigned long j1;
-
- rcu_state.gp_start = j;
- j1 = rcu_jiffies_till_stall_check();
- /* Record ->gp_start before ->jiffies_stall. */
- smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
- rcu_state.jiffies_resched = j + j1 / 2;
- rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
-}
-
-/*
- * Complain about starvation of grace-period kthread.
- */
-static void rcu_check_gp_kthread_starvation(void)
-{
- struct task_struct *gpk = rcu_state.gp_kthread;
- unsigned long j;
-
- j = jiffies - READ_ONCE(rcu_state.gp_activity);
- if (j > 2 * HZ) {
- pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
- rcu_state.name, j,
- (long)rcu_seq_current(&rcu_state.gp_seq),
- rcu_state.gp_flags,
- gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
- if (gpk) {
- pr_err("RCU grace-period kthread stack dump:\n");
- sched_show_task(gpk);
- wake_up_process(gpk);
- }
- }
-}
-
-/*
- * Dump stacks of all tasks running on stalled CPUs. First try using
- * NMIs, but fall back to manual remote stack tracing on architectures
- * that don't support NMI-based stack dumps. The NMI-triggered stack
- * traces are more accurate because they are printed by the target CPU.
- */
-static void rcu_dump_cpu_stacks(void)
-{
- int cpu;
- unsigned long flags;
- struct rcu_node *rnp;
-
- rcu_for_each_leaf_node(rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
- if (!trigger_single_cpu_backtrace(cpu))
- dump_cpu_task(cpu);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
-}
-
-/*
- * If too much time has passed in the current grace period, and if
- * so configured, go kick the relevant kthreads.
- */
-static void rcu_stall_kick_kthreads(void)
-{
- unsigned long j;
-
- if (!rcu_kick_kthreads)
- return;
- j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
- if (time_after(jiffies, j) && rcu_state.gp_kthread &&
- (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
- WARN_ONCE(1, "Kicking %s grace-period kthread\n",
- rcu_state.name);
- rcu_ftrace_dump(DUMP_ALL);
- wake_up_process(rcu_state.gp_kthread);
- WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
- }
-}
-
-static void panic_on_rcu_stall(void)
-{
- if (sysctl_panic_on_rcu_stall)
- panic("RCU Stall\n");
-}
-
-static void print_other_cpu_stall(unsigned long gp_seq)
-{
- int cpu;
- unsigned long flags;
- unsigned long gpa;
- unsigned long j;
- int ndetected = 0;
- struct rcu_node *rnp = rcu_get_root();
- long totqlen = 0;
-
- /* Kick and suppress, if so configured. */
- rcu_stall_kick_kthreads();
- if (rcu_cpu_stall_suppress)
- return;
-
- /*
- * OK, time to rat on our buddy...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name);
- print_cpu_stall_info_begin();
- rcu_for_each_leaf_node(rnp) {
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ndetected += rcu_print_task_stall(rnp);
- if (rnp->qsmask != 0) {
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
- print_cpu_stall_info(cpu);
- ndetected++;
- }
- }
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- }
-
- print_cpu_stall_info_end();
- for_each_possible_cpu(cpu)
- totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
- smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
- if (ndetected) {
- rcu_dump_cpu_stacks();
-
- /* Complain about tasks blocking the grace period. */
- rcu_print_detail_task_stall();
- } else {
- if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
- pr_err("INFO: Stall ended before state dump start\n");
- } else {
- j = jiffies;
- gpa = READ_ONCE(rcu_state.gp_activity);
- pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
- rcu_state.name, j - gpa, j, gpa,
- READ_ONCE(jiffies_till_next_fqs),
- rcu_get_root()->qsmask);
- /* In this case, the current CPU might be at fault. */
- sched_show_task(current);
- }
- }
- /* Rewrite if needed in case of slow consoles. */
- if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
- WRITE_ONCE(rcu_state.jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
-
- rcu_check_gp_kthread_starvation();
-
- panic_on_rcu_stall();
-
- force_quiescent_state(); /* Kick them all. */
-}
-
-static void print_cpu_stall(void)
-{
- int cpu;
- unsigned long flags;
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- struct rcu_node *rnp = rcu_get_root();
- long totqlen = 0;
-
- /* Kick and suppress, if so configured. */
- rcu_stall_kick_kthreads();
- if (rcu_cpu_stall_suppress)
- return;
-
- /*
- * OK, time to rat on ourselves...
- * See Documentation/RCU/stallwarn.txt for info on how to debug
- * RCU CPU stall warnings.
- */
- pr_err("INFO: %s self-detected stall on CPU", rcu_state.name);
- print_cpu_stall_info_begin();
- raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
- print_cpu_stall_info(smp_processor_id());
- raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
- print_cpu_stall_info_end();
- for_each_possible_cpu(cpu)
- totqlen += rcu_get_n_cbs_cpu(cpu);
- pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n",
- jiffies - rcu_state.gp_start,
- (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
-
- rcu_check_gp_kthread_starvation();
-
- rcu_dump_cpu_stacks();
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- /* Rewrite if needed in case of slow consoles. */
- if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
- WRITE_ONCE(rcu_state.jiffies_stall,
- jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-
- panic_on_rcu_stall();
-
- /*
- * Attempt to revive the RCU machinery by forcing a context switch.
- *
- * A context switch would normally allow the RCU state machine to make
- * progress and it could be we're stuck in kernel space without context
- * switches for an entirely unreasonable amount of time.
- */
- set_tsk_need_resched(current);
- set_preempt_need_resched();
-}
-
-static void check_cpu_stall(struct rcu_data *rdp)
-{
- unsigned long gs1;
- unsigned long gs2;
- unsigned long gps;
- unsigned long j;
- unsigned long jn;
- unsigned long js;
- struct rcu_node *rnp;
-
- if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
- !rcu_gp_in_progress())
- return;
- rcu_stall_kick_kthreads();
- j = jiffies;
-
- /*
- * Lots of memory barriers to reject false positives.
- *
- * The idea is to pick up rcu_state.gp_seq, then
- * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
- * another copy of rcu_state.gp_seq. These values are updated in
- * the opposite order with memory barriers (or equivalent) during
- * grace-period initialization and cleanup. Now, a false positive
- * can occur if we get an new value of rcu_state.gp_start and a old
- * value of rcu_state.jiffies_stall. But given the memory barriers,
- * the only way that this can happen is if one grace period ends
- * and another starts between these two fetches. This is detected
- * by comparing the second fetch of rcu_state.gp_seq with the
- * previous fetch from rcu_state.gp_seq.
- *
- * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
- * and rcu_state.gp_start suffice to forestall false positives.
- */
- gs1 = READ_ONCE(rcu_state.gp_seq);
- smp_rmb(); /* Pick up ->gp_seq first... */
- js = READ_ONCE(rcu_state.jiffies_stall);
- smp_rmb(); /* ...then ->jiffies_stall before the rest... */
- gps = READ_ONCE(rcu_state.gp_start);
- smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
- gs2 = READ_ONCE(rcu_state.gp_seq);
- if (gs1 != gs2 ||
- ULONG_CMP_LT(j, js) ||
- ULONG_CMP_GE(gps, js))
- return; /* No stall or GP completed since entering function. */
- rnp = rdp->mynode;
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
- if (rcu_gp_in_progress() &&
- (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
- cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
- /* We haven't checked in, so go dump stack. */
- print_cpu_stall();
-
- } else if (rcu_gp_in_progress() &&
- ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
- cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
-
- /* They had a few time units to dump stack, so complain. */
- print_other_cpu_stall(gs2);
- }
-}
-
-/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
- *
- * The caller must disable hard irqs.
- */
-void rcu_cpu_stall_reset(void)
-{
- WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
-}
-
/* Trace-event wrapper function for trace_rcu_future_grace_period. */
static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long gp_seq_req, const char *s)
@@ -1557,17 +1209,28 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
}
/*
- * Awaken the grace-period kthread. Don't do a self-awaken, and don't
- * bother awakening when there is nothing for the grace-period kthread
- * to do (as in several CPUs raced to awaken, and we lost), and finally
- * don't try to awaken a kthread that has not yet been created.
+ * Awaken the grace-period kthread. Don't do a self-awaken (unless in
+ * an interrupt or softirq handler), and don't bother awakening when there
+ * is nothing for the grace-period kthread to do (as in several CPUs raced
+ * to awaken, and we lost), and finally don't try to awaken a kthread that
+ * has not yet been created. If all those checks are passed, track some
+ * debug information and awaken.
+ *
+ * So why do the self-wakeup when in an interrupt or softirq handler
+ * in the grace-period kthread's context? Because the kthread might have
+ * been interrupted just as it was going to sleep, and just after the final
+ * pre-sleep check of the awaken condition. In this case, a wakeup really
+ * is required, and is therefore supplied.
*/
static void rcu_gp_kthread_wake(void)
{
- if (current == rcu_state.gp_kthread ||
+ if ((current == rcu_state.gp_kthread &&
+ !in_irq() && !in_serving_softirq()) ||
!READ_ONCE(rcu_state.gp_flags) ||
!rcu_state.gp_kthread)
return;
+ WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
+ WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
swake_up_one(&rcu_state.gp_wq);
}
@@ -1711,7 +1374,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
zero_cpu_stall_ticks(rdp);
}
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
- if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap)
+ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
rdp->gp_seq_needed = rnp->gp_seq_needed;
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
@@ -1939,7 +1602,7 @@ static void rcu_gp_fqs_loop(void)
if (!ret) {
rcu_state.jiffies_force_qs = jiffies + j;
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
- jiffies + 3 * j);
+ jiffies + (j ? 3 * j : 2));
}
trace_rcu_grace_period(rcu_state.name,
READ_ONCE(rcu_state.gp_seq),
@@ -2272,11 +1935,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
return;
}
mask = rdp->grpmask;
+ rdp->core_needs_qs = false;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
- rdp->core_needs_qs = false;
-
/*
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
@@ -2329,14 +1991,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
*/
int rcutree_dying_cpu(unsigned int cpu)
{
- RCU_TRACE(bool blkd;)
- RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);)
- RCU_TRACE(struct rcu_node *rnp = rdp->mynode;)
+ bool blkd;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rdp->mynode;
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
return 0;
- RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);)
+ blkd = !!(rnp->qsmask & rdp->grpmask);
trace_rcu_grace_period(rcu_state.name, rnp->gp_seq,
blkd ? TPS("cpuofl") : TPS("cpuofl-bgp"));
return 0;
@@ -2473,7 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Reinstate batch limit if we have worked down the excess. */
count = rcu_segcblist_n_cbs(&rdp->cblist);
- if (rdp->blimit == LONG_MAX && count <= qlowmark)
+ if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
rdp->blimit = blimit;
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
@@ -2497,14 +2159,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
/*
- * Check to see if this CPU is in a non-context-switch quiescent state
- * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule RCU core processing.
- *
- * This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt.
+ * This function is invoked from each scheduling-clock interrupt,
+ * and checks to see if this CPU is in a non-context-switch quiescent
+ * state, for example, user mode or idle loop. It also schedules RCU
+ * core processing. If the current grace period has gone on too long,
+ * it will ask the scheduler to manufacture a context switch for the sole
+ * purpose of providing a providing the needed quiescent state.
*/
-void rcu_check_callbacks(int user)
+void rcu_sched_clock_irq(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
raw_cpu_inc(rcu_data.ticks_this_gp);
@@ -2517,7 +2179,7 @@ void rcu_check_callbacks(int user)
}
__this_cpu_write(rcu_data.rcu_urgent_qs, false);
}
- rcu_flavor_check_callbacks(user);
+ rcu_flavor_sched_clock_irq(user);
if (rcu_pending())
invoke_rcu_core();
@@ -2525,11 +2187,11 @@ void rcu_check_callbacks(int user)
}
/*
- * Scan the leaf rcu_node structures, processing dyntick state for any that
- * have not yet encountered a quiescent state, using the function specified.
- * Also initiate boosting for any threads blocked on the root rcu_node.
- *
- * The caller must have suppressed start of new grace periods.
+ * Scan the leaf rcu_node structures. For each structure on which all
+ * CPUs have reported a quiescent state and on which there are tasks
+ * blocking the current grace period, initiate RCU priority boosting.
+ * Otherwise, invoke the specified function to check dyntick state for
+ * each CPU that has not yet reported a quiescent state.
*/
static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
{
@@ -2578,7 +2240,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
* Force quiescent states on reluctant CPUs, and also detect which
* CPUs are in dyntick-idle mode.
*/
-static void force_quiescent_state(void)
+void rcu_force_quiescent_state(void)
{
unsigned long flags;
bool ret;
@@ -2610,113 +2272,10 @@ static void force_quiescent_state(void)
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
rcu_gp_kthread_wake();
}
+EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
-/*
- * This function checks for grace-period requests that fail to motivate
- * RCU to come out of its idle mode.
- */
-void
-rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
- const unsigned long gpssdelay)
-{
- unsigned long flags;
- unsigned long j;
- struct rcu_node *rnp_root = rcu_get_root();
- static atomic_t warned = ATOMIC_INIT(0);
-
- if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
- return;
- j = jiffies; /* Expensive access, and in common case don't get here. */
- if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
- time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
- atomic_read(&warned))
- return;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- j = jiffies;
- if (rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
- time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
- time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
- atomic_read(&warned)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- /* Hold onto the leaf lock to make others see warned==1. */
-
- if (rnp_root != rnp)
- raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
- j = jiffies;
- if (rcu_gp_in_progress() ||
- ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
- time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
- time_before(j, rcu_state.gp_activity + gpssdelay) ||
- atomic_xchg(&warned, 1)) {
- raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n",
- __func__, (long)READ_ONCE(rcu_state.gp_seq),
- (long)READ_ONCE(rnp_root->gp_seq_needed),
- j - rcu_state.gp_req_activity, j - rcu_state.gp_activity,
- rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name,
- rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL);
- WARN_ON(1);
- if (rnp_root != rnp)
- raw_spin_unlock_rcu_node(rnp_root);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-}
-
-/*
- * Do a forward-progress check for rcutorture. This is normally invoked
- * due to an OOM event. The argument "j" gives the time period during
- * which rcutorture would like progress to have been made.
- */
-void rcu_fwd_progress_check(unsigned long j)
-{
- unsigned long cbs;
- int cpu;
- unsigned long max_cbs = 0;
- int max_cpu = -1;
- struct rcu_data *rdp;
-
- if (rcu_gp_in_progress()) {
- pr_info("%s: GP age %lu jiffies\n",
- __func__, jiffies - rcu_state.gp_start);
- show_rcu_gp_kthreads();
- } else {
- pr_info("%s: Last GP end %lu jiffies ago\n",
- __func__, jiffies - rcu_state.gp_end);
- preempt_disable();
- rdp = this_cpu_ptr(&rcu_data);
- rcu_check_gp_start_stall(rdp->mynode, rdp, j);
- preempt_enable();
- }
- for_each_possible_cpu(cpu) {
- cbs = rcu_get_n_cbs_cpu(cpu);
- if (!cbs)
- continue;
- if (max_cpu < 0)
- pr_info("%s: callbacks", __func__);
- pr_cont(" %d: %lu", cpu, cbs);
- if (cbs <= max_cbs)
- continue;
- max_cbs = cbs;
- max_cpu = cpu;
- }
- if (max_cpu >= 0)
- pr_cont("\n");
-}
-EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
-
-/*
- * This does the RCU core processing work for the specified rcu_data
- * structures. This may be called only from the CPU to whom the rdp
- * belongs.
- */
-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
+/* Perform RCU core processing work for the current CPU. */
+static __latent_entropy void rcu_core(void)
{
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
@@ -2750,37 +2309,126 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
- invoke_rcu_callbacks(rdp);
+ if (rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ likely(READ_ONCE(rcu_scheduler_fully_active)))
+ rcu_do_batch(rdp);
/* Do any needed deferred wakeups of rcuo kthreads. */
do_nocb_deferred_wakeup(rdp);
trace_rcu_utilization(TPS("End RCU core"));
}
+static void rcu_core_si(struct softirq_action *h)
+{
+ rcu_core();
+}
+
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * If the thread is yielding, only wake it when this
+ * is invoked from idle
+ */
+ if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
+ wake_up_process(t);
+}
+
+static void invoke_rcu_core_kthread(void)
+{
+ struct task_struct *t;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
+ t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
+ if (t != NULL && t != current)
+ rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
+ local_irq_restore(flags);
+}
+
/*
- * Schedule RCU callback invocation. If the running implementation of RCU
- * does not support RCU priority boosting, just do a direct call, otherwise
- * wake up the per-CPU kernel kthread. Note that because we are running
- * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task
- * cannot disappear out from under us.
+ * Wake up this CPU's rcuc kthread to do RCU core processing.
*/
-static void invoke_rcu_callbacks(struct rcu_data *rdp)
+static void invoke_rcu_core(void)
{
- if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
- return;
- if (likely(!rcu_state.boost)) {
- rcu_do_batch(rdp);
+ if (!cpu_online(smp_processor_id()))
return;
+ if (use_softirq)
+ raise_softirq(RCU_SOFTIRQ);
+ else
+ invoke_rcu_core_kthread();
+}
+
+static void rcu_cpu_kthread_park(unsigned int cpu)
+{
+ per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+}
+
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(rcu_data.rcu_cpu_has_work);
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces
+ * the RCU softirq used in configurations of RCU that do not support RCU
+ * priority boosting.
+ */
+static void rcu_cpu_kthread(unsigned int cpu)
+{
+ unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
+ char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ int spincnt;
+
+ for (spincnt = 0; spincnt < 10; spincnt++) {
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
+ local_bh_disable();
+ *statusp = RCU_KTHREAD_RUNNING;
+ local_irq_disable();
+ work = *workp;
+ *workp = 0;
+ local_irq_enable();
+ if (work)
+ rcu_core();
+ local_bh_enable();
+ if (*workp == 0) {
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
+ }
}
- invoke_rcu_callbacks_kthread();
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
+ *statusp = RCU_KTHREAD_WAITING;
}
-static void invoke_rcu_core(void)
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_data.rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
+
+/*
+ * Spawn per-CPU RCU core processing kthreads.
+ */
+static int __init rcu_spawn_core_kthreads(void)
{
- if (cpu_online(smp_processor_id()))
- raise_softirq(RCU_SOFTIRQ);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
+ if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+ return 0;
+ WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
+ "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
+ return 0;
}
+early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
@@ -2801,9 +2449,9 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
/*
* Force the grace period if too many callbacks or too long waiting.
- * Enforce hysteresis, and don't invoke force_quiescent_state()
+ * Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
* if some other CPU has recently done so. Also, don't bother
- * invoking force_quiescent_state() if the newly enqueued callback
+ * invoking rcu_force_quiescent_state() if the newly enqueued callback
* is the only one waiting for a grace period to complete.
*/
if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
@@ -2817,10 +2465,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
} else {
/* Give the grace period a kick. */
- rdp->blimit = LONG_MAX;
+ rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
- force_quiescent_state();
+ rcu_force_quiescent_state();
rdp->n_force_qs_snap = rcu_state.n_force_qs;
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
@@ -2855,7 +2503,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
* Use rcu:rcu_callback trace event to find the previous
* time callback was passed to __call_rcu().
*/
- WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n",
+ WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",
head, head->func);
WRITE_ONCE(head->func, rcu_leak_callback);
return;
@@ -2889,9 +2537,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
rcu_segcblist_init(&rdp->cblist);
}
rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
- if (!lazy)
- rcu_idle_count_callbacks_posted();
-
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
(unsigned long)func,
@@ -2961,6 +2606,79 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
+/*
+ * During early boot, any blocking grace-period wait automatically
+ * implies a grace period. Later on, this is never the case for PREEMPT.
+ *
+ * Howevr, because a context switch is a grace period for !PREEMPT, any
+ * blocking grace-period wait automatically implies a grace period if
+ * there is only one CPU online at any point time during execution of
+ * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
+ * occasionally incorrectly indicate that there are multiple CPUs online
+ * when there was in fact only one the whole time, as this just adds some
+ * overhead: RCU still operates correctly.
+ */
+static int rcu_blocking_is_gp(void)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
+ might_sleep(); /* Check for RCU read-side critical section. */
+ preempt_disable();
+ ret = num_online_cpus() <= 1;
+ preempt_enable();
+ return ret;
+}
+
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed. Note, however, that
+ * upon return from synchronize_rcu(), the caller might well be executing
+ * concurrently with new RCU read-side critical sections that began while
+ * synchronize_rcu() was waiting. RCU read-side critical sections are
+ * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
+ * In addition, regions of code across which interrupts, preemption, or
+ * softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that this guarantee implies further memory-ordering guarantees.
+ * On systems with more than one CPU, when synchronize_rcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last RCU read-side critical section whose beginning
+ * preceded the call to synchronize_rcu(). In addition, each CPU having
+ * an RCU read-side critical section that extends beyond the return from
+ * synchronize_rcu() is guaranteed to have executed a full memory barrier
+ * after the beginning of synchronize_rcu() and before the beginning of
+ * that RCU read-side critical section. Note that these guarantees include
+ * CPUs that are offline, idle, or executing in user mode, as well as CPUs
+ * that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_rcu(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ */
+void synchronize_rcu(void)
+{
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
+ if (rcu_blocking_is_gp())
+ return;
+ if (rcu_gp_is_expedited())
+ synchronize_rcu_expedited();
+ else
+ wait_rcu_gp(call_rcu);
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+
/**
* get_state_synchronize_rcu - Snapshot current RCU state
*
@@ -3049,28 +2767,6 @@ static int rcu_pending(void)
}
/*
- * Return true if the specified CPU has any callback. If all_lazy is
- * non-NULL, store an indication of whether all callbacks are lazy.
- * (If there are no callbacks, all of them are deemed to be lazy.)
- */
-static bool rcu_cpu_has_callbacks(bool *all_lazy)
-{
- bool al = true;
- bool hc = false;
- struct rcu_data *rdp;
-
- rdp = this_cpu_ptr(&rcu_data);
- if (!rcu_segcblist_empty(&rdp->cblist)) {
- hc = true;
- if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist))
- al = false;
- }
- if (all_lazy)
- *all_lazy = al;
- return hc;
-}
-
-/*
* Helper function for rcu_barrier() tracing. If tracing is disabled,
* the compiler is expected to optimize this away.
*/
@@ -3299,7 +2995,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_prepare_kthreads(cpu);
- rcu_spawn_all_nocb_kthreads(cpu);
+ rcu_spawn_cpu_nocb_kthread(cpu);
return 0;
}
@@ -3329,8 +3025,6 @@ int rcutree_online_cpu(unsigned int cpu)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->ffmask |= rdp->grpmask;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_online_cpu(cpu);
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return 0; /* Too early in boot for scheduler work. */
sync_sched_exp_online_cleanup(cpu);
@@ -3355,8 +3049,6 @@ int rcutree_offline_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcutree_affinity_setting(cpu, cpu);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_offline_cpu(cpu);
return 0;
}
@@ -3500,13 +3192,11 @@ static int rcu_pm_notify(struct notifier_block *self,
switch (action) {
case PM_HIBERNATION_PREPARE:
case PM_SUSPEND_PREPARE:
- if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_expedite_gp();
+ rcu_expedite_gp();
break;
case PM_POST_HIBERNATION:
case PM_POST_SUSPEND:
- if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
- rcu_unexpedite_gp();
+ rcu_unexpedite_gp();
break;
default:
break;
@@ -3683,8 +3373,7 @@ static void __init rcu_init_geometry(void)
jiffies_till_first_fqs = d;
if (jiffies_till_next_fqs == ULONG_MAX)
jiffies_till_next_fqs = d;
- if (jiffies_till_sched_qs == ULONG_MAX)
- adjust_jiffies_till_sched_qs();
+ adjust_jiffies_till_sched_qs();
/* If the compile-time values are accurate, just leave. */
if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
@@ -3777,7 +3466,8 @@ void __init rcu_init(void)
rcu_init_one();
if (dump_tree)
rcu_dump_rcu_node_tree();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ if (use_softirq)
+ open_softirq(RCU_SOFTIRQ, rcu_core_si);
/*
* We don't need protection against CPU-hotplug here because
@@ -3799,5 +3489,6 @@ void __init rcu_init(void)
srcu_init();
}
+#include "tree_stall.h"
#include "tree_exp.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d90b02b53c0e..7acaf3a62d39 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -1,25 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2008
*
* Author: Ingo Molnar <mingo@elte.hu>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/cache.h>
@@ -36,7 +23,6 @@
/* Communicate arguments to a workqueue handler. */
struct rcu_exp_work {
- smp_call_func_t rew_func;
unsigned long rew_s;
struct work_struct rew_work;
};
@@ -168,13 +154,15 @@ struct rcu_data {
bool core_needs_qs; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
- bool deferred_qs; /* This CPU awaiting a deferred QS? */
+ bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
unsigned long ticks_this_gp; /* The number of scheduling-clock */
/* ticks this CPU has handled */
/* during and after the last grace */
/* period it is aware of. */
+ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
+ bool defer_qs_iw_pending; /* Scheduler attention pending? */
/* 2) batch handling */
struct rcu_segcblist cblist; /* Segmented callback list, with */
@@ -194,10 +182,7 @@ struct rcu_data {
bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */
bool rcu_urgent_qs; /* GP old need light quiescent state. */
#ifdef CONFIG_RCU_FAST_NO_HZ
- bool all_lazy; /* Are all CPU's CBs lazy? */
- unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */
- unsigned long nonlazy_posted_snap;
- /* Nonlazy_posted snapshot. */
+ bool all_lazy; /* All CPU's CBs lazy at idle start? */
unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */
unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
@@ -234,7 +219,13 @@ struct rcu_data {
/* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
- /* 6) Diagnostic data, including RCU CPU stall warnings. */
+ /* 6) RCU priority boosting. */
+ struct task_struct *rcu_cpu_kthread_task;
+ /* rcuc per-CPU kthread or NULL. */
+ unsigned int rcu_cpu_kthread_status;
+ char rcu_cpu_has_work;
+
+ /* 7) Diagnostic data, including RCU CPU stall warnings. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
struct irq_work rcu_iw; /* Check for non-irq activity. */
@@ -303,6 +294,8 @@ struct rcu_state {
struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */
+ unsigned long gp_wake_time; /* Last GP kthread wake. */
+ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */
/* End of fields guarded by root rcu_node's lock. */
@@ -402,42 +395,29 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;
int rcu_dynticks_snap(struct rcu_data *rdp);
-#ifdef CONFIG_RCU_BOOST
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DECLARE_PER_CPU(char, rcu_cpu_has_work);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-/* Forward declarations for rcutree_plugin.h */
+/* Forward declarations for tree_plugin.h */
static void rcu_bootup_announce(void);
static void rcu_qs(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_print_detail_task_stall(void);
-static int rcu_print_task_stall(struct rcu_node *rnp);
static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-static void rcu_flavor_check_callbacks(int user);
+static void rcu_flavor_sched_clock_irq(int user);
void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
+static void rcu_cpu_kthread_setup(unsigned int cpu);
static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
static void rcu_prepare_for_idle(void);
-static void rcu_idle_count_callbacks_posted(void);
static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
-static void print_cpu_stall_info_begin(void);
-static void print_cpu_stall_info(int cpu);
-static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static bool rcu_nocb_cpu_needs_barrier(int cpu);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
@@ -451,7 +431,7 @@ static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
-static void rcu_spawn_all_nocb_kthreads(int cpu);
+static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
@@ -463,10 +443,9 @@ static bool rcu_nohz_full_cpu(void);
static void rcu_dynticks_task_enter(void);
static void rcu_dynticks_task_exit(void);
-#ifdef CONFIG_SRCU
-void srcu_online_cpu(unsigned int cpu);
-void srcu_offline_cpu(unsigned int cpu);
-#else /* #ifdef CONFIG_SRCU */
-void srcu_online_cpu(unsigned int cpu) { }
-void srcu_offline_cpu(unsigned int cpu) { }
-#endif /* #else #ifdef CONFIG_SRCU */
+/* Forward declarations for tree_stall.h */
+static void record_gp_stall_check_time(void);
+static void rcu_iw_handler(struct irq_work *iwp);
+static void check_cpu_stall(struct rcu_data *rdp);
+static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+ const unsigned long gpssdelay);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 928fe5893a57..af7e7b9c86af 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -1,27 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* RCU expedited grace periods
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2016
*
- * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
*/
#include <linux/lockdep.h>
+static void rcu_exp_handler(void *unused);
+static int rcu_print_task_exp_stall(struct rcu_node *rnp);
+
/*
* Record the start of an expedited grace period.
*/
@@ -260,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
*/
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
- WRITE_ONCE(rdp->deferred_qs, false);
+ WRITE_ONCE(rdp->exp_deferred_qs, false);
rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
}
@@ -269,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s)
{
if (rcu_exp_gp_seq_done(s)) {
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
- /* Ensure test happens before caller kfree(). */
- smp_mb__before_atomic(); /* ^^^ */
+ smp_mb(); /* Ensure test happens before caller kfree(). */
return true;
}
return false;
@@ -344,7 +333,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
{
int cpu;
unsigned long flags;
- smp_call_func_t func;
unsigned long mask_ofl_test;
unsigned long mask_ofl_ipi;
int ret;
@@ -352,7 +340,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
container_of(wp, struct rcu_exp_work, rew_work);
struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
- func = rewp->rew_func;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
/* Each pass checks a CPU for identity, offline, and idle. */
@@ -396,7 +383,12 @@ retry_ipi:
mask_ofl_test |= mask;
continue;
}
- ret = smp_call_function_single(cpu, func, NULL, 0);
+ if (get_cpu() == cpu) {
+ put_cpu();
+ continue;
+ }
+ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
+ put_cpu();
if (!ret) {
mask_ofl_ipi &= ~mask;
continue;
@@ -426,7 +418,7 @@ retry_ipi:
* Select the nodes that the upcoming expedited grace period needs
* to wait for.
*/
-static void sync_rcu_exp_select_cpus(smp_call_func_t func)
+static void sync_rcu_exp_select_cpus(void)
{
int cpu;
struct rcu_node *rnp;
@@ -440,7 +432,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)
rnp->exp_need_flush = false;
if (!READ_ONCE(rnp->expmask))
continue; /* Avoid early boot non-existent wq. */
- rnp->rew.rew_func = func;
if (!READ_ONCE(rcu_par_gp_wq) ||
rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
rcu_is_last_leaf_node(rnp)) {
@@ -449,7 +440,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)
continue;
}
INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
- preempt_disable();
cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
/* If all offline, queue the work on an unbound CPU. */
if (unlikely(cpu > rnp->grphi - rnp->grplo))
@@ -457,7 +447,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func)
else
cpu += rnp->grplo;
queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
- preempt_enable();
rnp->exp_need_flush = true;
}
@@ -580,10 +569,10 @@ static void rcu_exp_wait_wake(unsigned long s)
* Common code to drive an expedited grace period forward, used by
* workqueues and mid-boot-time tasks.
*/
-static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s)
+static void rcu_exp_sel_wait_wake(unsigned long s)
{
/* Initialize the rcu_node tree in preparation for the wait. */
- sync_rcu_exp_select_cpus(func);
+ sync_rcu_exp_select_cpus();
/* Wait and clean up, including waking everyone. */
rcu_exp_wait_wake(s);
@@ -597,52 +586,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp)
struct rcu_exp_work *rewp;
rewp = container_of(wp, struct rcu_exp_work, rew_work);
- rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s);
-}
-
-/*
- * Given a smp_call_function() handler, kick off the specified
- * implementation of expedited grace period.
- */
-static void _synchronize_rcu_expedited(smp_call_func_t func)
-{
- struct rcu_data *rdp;
- struct rcu_exp_work rew;
- struct rcu_node *rnp;
- unsigned long s;
-
- /* If expedited grace periods are prohibited, fall back to normal. */
- if (rcu_gp_is_normal()) {
- wait_rcu_gp(call_rcu);
- return;
- }
-
- /* Take a snapshot of the sequence number. */
- s = rcu_exp_gp_seq_snap();
- if (exp_funnel_lock(s))
- return; /* Someone else did our work for us. */
-
- /* Ensure that load happens before action based on it. */
- if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
- /* Direct call during scheduler init and early_initcalls(). */
- rcu_exp_sel_wait_wake(func, s);
- } else {
- /* Marshall arguments & schedule the expedited grace period. */
- rew.rew_func = func;
- rew.rew_s = s;
- INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
- queue_work(rcu_gp_wq, &rew.rew_work);
- }
-
- /* Wait for expedited grace period to complete. */
- rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
- rnp = rcu_get_root();
- wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
- sync_exp_work_done(s));
- smp_mb(); /* Workqueue actions happen before return. */
-
- /* Let the next expedited grace period start. */
- mutex_unlock(&rcu_state.exp_mutex);
+ rcu_exp_sel_wait_wake(rewp->rew_s);
}
#ifdef CONFIG_PREEMPT_RCU
@@ -654,7 +598,7 @@ static void _synchronize_rcu_expedited(smp_call_func_t func)
* ->expmask fields in the rcu_node tree. Otherwise, immediately
* report the quiescent state.
*/
-static void sync_rcu_exp_handler(void *unused)
+static void rcu_exp_handler(void *unused)
{
unsigned long flags;
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
@@ -671,7 +615,7 @@ static void sync_rcu_exp_handler(void *unused)
rcu_dynticks_curr_cpu_in_eqs()) {
rcu_report_exp_rdp(rdp);
} else {
- rdp->deferred_qs = true;
+ rdp->exp_deferred_qs = true;
set_tsk_need_resched(t);
set_preempt_need_resched();
}
@@ -693,10 +637,11 @@ static void sync_rcu_exp_handler(void *unused)
if (t->rcu_read_lock_nesting > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
- rdp->deferred_qs = true;
- WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true);
+ rdp->exp_deferred_qs = true;
+ t->rcu_read_unlock_special.b.exp_hint = true;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
}
/*
@@ -708,14 +653,14 @@ static void sync_rcu_exp_handler(void *unused)
*
* If the CPU is fully enabled (or if some buggy RCU-preempt
* read-side critical section is being used from idle), just
- * invoke rcu_preempt_defer_qs() to immediately report the
+ * invoke rcu_preempt_deferred_qs() to immediately report the
* quiescent state. We cannot use rcu_read_unlock_special()
* because we are in an interrupt handler, which will cause that
* function to take an early exit without doing anything.
*
* Otherwise, force a context switch after the CPU enables everything.
*/
- rdp->deferred_qs = true;
+ rdp->exp_deferred_qs = true;
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) {
rcu_preempt_deferred_qs(t);
@@ -730,43 +675,41 @@ static void sync_sched_exp_online_cleanup(int cpu)
{
}
-/**
- * synchronize_rcu_expedited - Brute-force RCU grace period
- *
- * Wait for an RCU-preempt grace period, but expedite it. The basic
- * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
- * checks whether the CPU is in an RCU-preempt critical section, and
- * if so, it sets a flag that causes the outermost rcu_read_unlock()
- * to report the quiescent state. On the other hand, if the CPU is
- * not in an RCU read-side critical section, the IPI handler reports
- * the quiescent state immediately.
- *
- * Although this is a greate improvement over previous expedited
- * implementations, it is still unfriendly to real-time workloads, so is
- * thus not recommended for any sort of common-case code. In fact, if
- * you are using synchronize_rcu_expedited() in a loop, please restructure
- * your code to batch your updates, and then Use a single synchronize_rcu()
- * instead.
- *
- * This has the same semantics as (but is more brutal than) synchronize_rcu().
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each that is blocking the current
+ * expedited grace period.
*/
-void synchronize_rcu_expedited(void)
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
-
- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
- return;
- _synchronize_rcu_expedited(sync_rcu_exp_handler);
+ struct task_struct *t;
+ int ndetected = 0;
+
+ if (!rnp->exp_tasks)
+ return 0;
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ return ndetected;
}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#else /* #ifdef CONFIG_PREEMPT_RCU */
+/* Request an expedited quiescent state. */
+static void rcu_exp_need_qs(void)
+{
+ __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
+ /* Store .exp before .rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
/* Invoked on each online non-idle CPU for expedited quiescent state. */
-static void sync_sched_exp_handler(void *unused)
+static void rcu_exp_handler(void *unused)
{
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -780,62 +723,117 @@ static void sync_sched_exp_handler(void *unused)
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
return;
}
- __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
- /* Store .exp before .rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ rcu_exp_need_qs();
}
/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
static void sync_sched_exp_online_cleanup(int cpu)
{
+ unsigned long flags;
+ int my_cpu;
struct rcu_data *rdp;
int ret;
struct rcu_node *rnp;
rdp = per_cpu_ptr(&rcu_data, cpu);
rnp = rdp->mynode;
- if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
+ my_cpu = get_cpu();
+ /* Quiescent state either not needed or already requested, leave. */
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+ __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) {
+ put_cpu();
return;
- ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0);
+ }
+ /* Quiescent state needed on current CPU, so set it up locally. */
+ if (my_cpu == cpu) {
+ local_irq_save(flags);
+ rcu_exp_need_qs();
+ local_irq_restore(flags);
+ put_cpu();
+ return;
+ }
+ /* Quiescent state needed on some other CPU, send IPI. */
+ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
+ put_cpu();
WARN_ON_ONCE(ret);
}
/*
- * Because a context switch is a grace period for !PREEMPT, any
- * blocking grace-period wait automatically implies a grace period if
- * there is only one CPU online at any point time during execution of
- * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds some
- * overhead: RCU still operates correctly.
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections that are
+ * blocking the current expedited grace period.
*/
-static int rcu_blocking_is_gp(void)
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
{
- int ret;
-
- might_sleep(); /* Check for RCU read-side critical section. */
- preempt_disable();
- ret = num_online_cpus() <= 1;
- preempt_enable();
- return ret;
+ return 0;
}
-/* PREEMPT=n implementation of synchronize_rcu_expedited(). */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/**
+ * synchronize_rcu_expedited - Brute-force RCU grace period
+ *
+ * Wait for an RCU grace period, but expedite it. The basic idea is to
+ * IPI all non-idle non-nohz online CPUs. The IPI handler checks whether
+ * the CPU is in an RCU critical section, and if so, it sets a flag that
+ * causes the outermost rcu_read_unlock() to report the quiescent state
+ * for RCU-preempt or asks the scheduler for help for RCU-sched. On the
+ * other hand, if the CPU is not in an RCU read-side critical section,
+ * the IPI handler reports the quiescent state immediately.
+ *
+ * Although this is a greate improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, so is
+ * thus not recommended for any sort of common-case code. In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then Use a single synchronize_rcu()
+ * instead.
+ *
+ * This has the same semantics as (but is more brutal than) synchronize_rcu().
+ */
void synchronize_rcu_expedited(void)
{
+ struct rcu_exp_work rew;
+ struct rcu_node *rnp;
+ unsigned long s;
+
RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu_expedited() in RCU read-side critical section");
- /* If only one CPU, this is automatically a grace period. */
+ /* Is the state is such that the call is a grace period? */
if (rcu_blocking_is_gp())
return;
- _synchronize_rcu_expedited(sync_sched_exp_handler);
+ /* If expedited grace periods are prohibited, fall back to normal. */
+ if (rcu_gp_is_normal()) {
+ wait_rcu_gp(call_rcu);
+ return;
+ }
+
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap();
+ if (exp_funnel_lock(s))
+ return; /* Someone else did our work for us. */
+
+ /* Ensure that load happens before action based on it. */
+ if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ /* Direct call during scheduler init and early_initcalls(). */
+ rcu_exp_sel_wait_wake(s);
+ } else {
+ /* Marshall arguments & schedule the expedited grace period. */
+ rew.rew_s = s;
+ INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+ queue_work(rcu_gp_wq, &rew.rew_work);
+ }
+
+ /* Wait for expedited grace period to complete. */
+ rnp = rcu_get_root();
+ wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
+ sync_exp_work_done(s));
+ smp_mb(); /* Workqueue actions happen before return. */
+
+ /* Let the next expedited grace period start. */
+ mutex_unlock(&rcu_state.exp_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1b3dd2fc0cd6..acb225023ed1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1,63 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions that provide either classic
* or preemptible semantics.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright Red Hat, 2009
* Copyright IBM Corporation, 2009
*
* Author: Ingo Molnar <mingo@elte.hu>
- * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*/
-#include <linux/delay.h>
-#include <linux/gfp.h>
-#include <linux/oom.h>
-#include <linux/sched/debug.h>
-#include <linux/smpboot.h>
-#include <linux/sched/isolation.h>
-#include <uapi/linux/sched/types.h>
-#include "../time/tick-internal.h"
-
-#ifdef CONFIG_RCU_BOOST
-
#include "../locking/rtmutex_common.h"
-/*
- * Control variables for per-CPU and per-rcu_node kthreads.
- */
-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DEFINE_PER_CPU(char, rcu_cpu_has_work);
-
-#else /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
- * all uses are in dead code. Provide a definition to keep the compiler
- * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
- * This probably needs to be excluded from -rt builds.
- */
-#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
-#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
-
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
-
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
@@ -117,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (!use_softirq)
+ pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
pr_info("\tRCU debug extended QS entry/exit.\n");
rcupdate_announce_bootup_oddness();
@@ -280,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
*/
- if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs)
+ if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
else
- WARN_ON_ONCE(rdp->deferred_qs);
+ WARN_ON_ONCE(rdp->exp_deferred_qs);
}
/*
@@ -307,8 +264,8 @@ static void rcu_qs(void)
__this_cpu_read(rcu_data.gp_seq),
TPS("cpuqs"));
__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
- barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */
- current->rcu_read_unlock_special.b.need_qs = false;
+ barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
+ WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
}
}
@@ -380,7 +337,7 @@ void rcu_note_context_switch(bool preempt)
* means that we continue to block the current grace period.
*/
rcu_qs();
- if (rdp->deferred_qs)
+ if (rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
trace_rcu_utilization(TPS("End context switch"));
barrier(); /* Avoid RCU read-side critical sections leaking up. */
@@ -494,14 +451,15 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
special = t->rcu_read_unlock_special;
rdp = this_cpu_ptr(&rcu_data);
- if (!special.s && !rdp->deferred_qs) {
+ if (!special.s && !rdp->exp_deferred_qs) {
local_irq_restore(flags);
return;
}
+ t->rcu_read_unlock_special.b.deferred_qs = false;
if (special.b.need_qs) {
rcu_qs();
t->rcu_read_unlock_special.b.need_qs = false;
- if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) {
+ if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) {
local_irq_restore(flags);
return;
}
@@ -513,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* tasks are handled when removing the task from the
* blocked-tasks list below.
*/
- if (rdp->deferred_qs) {
+ if (rdp->exp_deferred_qs) {
rcu_report_exp_rdp(rdp);
if (!t->rcu_read_unlock_special.s) {
local_irq_restore(flags);
@@ -602,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
- return (__this_cpu_read(rcu_data.deferred_qs) ||
+ return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
t->rcu_read_lock_nesting <= 0;
}
@@ -630,6 +588,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t)
}
/*
+ * Minimal handler to give the scheduler a chance to re-evaluate.
+ */
+static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+
+ rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
+ rdp->defer_qs_iw_pending = false;
+}
+
+/*
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
@@ -648,16 +617,41 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
irqs_were_disabled = irqs_disabled_flags(flags);
if (preempt_bh_were_disabled || irqs_were_disabled) {
- WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
- /* Need to defer quiescent state until everything is enabled. */
- if (irqs_were_disabled) {
- /* Enabling irqs does not reschedule, so... */
+ bool exp;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rdp->mynode;
+
+ t->rcu_read_unlock_special.b.exp_hint = false;
+ exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
+ (rdp->grpmask & rnp->expmask) ||
+ tick_nohz_full_cpu(rdp->cpu);
+ // Need to defer quiescent state until everything is enabled.
+ if ((exp || in_irq()) && irqs_were_disabled && use_softirq &&
+ (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
+ // Using softirq, safe to awaken, and we get
+ // no help from enabling irqs, unlike bh/preempt.
raise_softirq_irqoff(RCU_SOFTIRQ);
+ } else if (exp && irqs_were_disabled && !use_softirq &&
+ !t->rcu_read_unlock_special.b.deferred_qs) {
+ // Safe to awaken and we get no help from enabling
+ // irqs, unlike bh/preempt.
+ invoke_rcu_core();
} else {
- /* Enabling BH or preempt does reschedule, so... */
+ // Enabling BH or preempt does reschedule, so...
+ // Also if no expediting or NO_HZ_FULL, slow is OK.
set_tsk_need_resched(current);
set_preempt_need_resched();
+ if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ !rdp->defer_qs_iw_pending && exp) {
+ // Get scheduler to re-evaluate and call hooks.
+ // If !IRQ_WORK, FQS scan will eventually IPI.
+ init_irq_work(&rdp->defer_qs_iw,
+ rcu_preempt_deferred_qs_handler);
+ rdp->defer_qs_iw_pending = true;
+ irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
+ }
}
+ t->rcu_read_unlock_special.b.deferred_qs = true;
local_irq_restore(flags);
return;
}
@@ -666,100 +660,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
}
/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period on the specified rcu_node structure.
- */
-static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
-{
- unsigned long flags;
- struct task_struct *t;
-
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- if (!rcu_preempt_blocked_readers_cgp(rnp)) {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- return;
- }
- t = list_entry(rnp->gp_tasks->prev,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- /*
- * We could be printing a lot while holding a spinlock.
- * Avoid triggering hard lockup.
- */
- touch_nmi_watchdog();
- sched_show_task(t);
- }
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
-}
-
-/*
- * Dump detailed information for all tasks blocking the current RCU
- * grace period.
- */
-static void rcu_print_detail_task_stall(void)
-{
- struct rcu_node *rnp = rcu_get_root();
-
- rcu_print_detail_task_stall_rnp(rnp);
- rcu_for_each_leaf_node(rnp)
- rcu_print_detail_task_stall_rnp(rnp);
-}
-
-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
-{
- pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
- rnp->level, rnp->grplo, rnp->grphi);
-}
-
-static void rcu_print_task_stall_end(void)
-{
- pr_cont("\n");
-}
-
-/*
- * Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
-{
- struct task_struct *t;
- int ndetected = 0;
-
- if (!rcu_preempt_blocked_readers_cgp(rnp))
- return 0;
- rcu_print_task_stall_begin(rnp);
- t = list_entry(rnp->gp_tasks->prev,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- pr_cont(" P%d", t->pid);
- ndetected++;
- }
- rcu_print_task_stall_end();
- return ndetected;
-}
-
-/*
- * Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each that is blocking the current
- * expedited grace period.
- */
-static int rcu_print_task_exp_stall(struct rcu_node *rnp)
-{
- struct task_struct *t;
- int ndetected = 0;
-
- if (!rnp->exp_tasks)
- return 0;
- t = list_entry(rnp->exp_tasks->prev,
- struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
- pr_cont(" P%d", t->pid);
- ndetected++;
- }
- return ndetected;
-}
-
-/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
@@ -788,13 +688,13 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
}
/*
- * Check for a quiescent state from the current CPU. When a task blocks,
- * the task is recorded in the corresponding CPU's rcu_node structure,
- * which is checked elsewhere.
- *
- * Caller must disable hard irqs.
+ * Check for a quiescent state from the current CPU, including voluntary
+ * context switches for Tasks RCU. When a task blocks, the task is
+ * recorded in the corresponding CPU's rcu_node structure, which is checked
+ * elsewhere, hence this function need only check for quiescent states
+ * related to the current CPU, not to those related to tasks.
*/
-static void rcu_flavor_check_callbacks(int user)
+static void rcu_flavor_sched_clock_irq(int user)
{
struct task_struct *t = current;
@@ -825,69 +725,27 @@ static void rcu_flavor_check_callbacks(int user)
t->rcu_read_unlock_special.b.need_qs = true;
}
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed. Note, however, that
- * upon return from synchronize_rcu(), the caller might well be executing
- * concurrently with new RCU read-side critical sections that began while
- * synchronize_rcu() was waiting. RCU read-side critical sections are
- * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
- * In addition, regions of code across which interrupts, preemption, or
- * softirqs have been disabled also serve as RCU read-side critical
- * sections. This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that this guarantee implies further memory-ordering guarantees.
- * On systems with more than one CPU, when synchronize_rcu() returns,
- * each CPU is guaranteed to have executed a full memory barrier since
- * the end of its last RCU read-side critical section whose beginning
- * preceded the call to synchronize_rcu(). In addition, each CPU having
- * an RCU read-side critical section that extends beyond the return from
- * synchronize_rcu() is guaranteed to have executed a full memory barrier
- * after the beginning of synchronize_rcu() and before the beginning of
- * that RCU read-side critical section. Note that these guarantees include
- * CPUs that are offline, idle, or executing in user mode, as well as CPUs
- * that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked synchronize_rcu(), which returned
- * to its caller on CPU B, then both CPU A and CPU B are guaranteed
- * to have executed a full memory barrier during the execution of
- * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
- * again only if the system has more than one CPU).
- */
-void synchronize_rcu(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
- return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
/*
* Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so. No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
+ * critical section, clean up if so. No need to issue warnings, as
+ * debug_check_no_locks_held() already does this if lockdep is enabled.
+ * Besides, if this function does anything other than just immediately
+ * return, there was a bug of some sort. Spewing warnings from this
+ * function is like as not to simply obscure important prior warnings.
*/
void exit_rcu(void)
{
struct task_struct *t = current;
- if (likely(list_empty(&current->rcu_node_entry)))
+ if (unlikely(!list_empty(&current->rcu_node_entry))) {
+ t->rcu_read_lock_nesting = 1;
+ barrier();
+ WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
+ } else if (unlikely(t->rcu_read_lock_nesting)) {
+ t->rcu_read_lock_nesting = 1;
+ } else {
return;
- t->rcu_read_lock_nesting = 1;
- barrier();
- t->rcu_read_unlock_special.b.blocked = true;
+ }
__rcu_read_unlock();
rcu_preempt_deferred_qs(current);
}
@@ -919,7 +777,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
i = 0;
list_for_each(lhp, &rnp->blkd_tasks) {
pr_cont(" %p", lhp);
- if (++i >= 10)
+ if (++i >= ncheck)
break;
}
pr_cont("\n");
@@ -1051,33 +909,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
static void rcu_preempt_deferred_qs(struct task_struct *t) { }
/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static void rcu_print_detail_task_stall(void)
-{
-}
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections.
- */
-static int rcu_print_task_stall(struct rcu_node *rnp)
-{
- return 0;
-}
-
-/*
- * Because preemptible RCU does not exist, we never have to check for
- * tasks blocked within RCU read-side critical sections that are
- * blocking the current expedited grace period.
- */
-static int rcu_print_task_exp_stall(struct rcu_node *rnp)
-{
- return 0;
-}
-
-/*
* Because there is no preemptible RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
@@ -1088,14 +919,10 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
}
/*
- * Check to see if this CPU is in a non-context-switch quiescent state
- * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule RCU core processing.
- *
- * This function must be called from hardirq context. It is normally
- * invoked from the scheduling-clock interrupt.
+ * Check to see if this CPU is in a non-context-switch quiescent state,
+ * namely user mode and idle loop.
*/
-static void rcu_flavor_check_callbacks(int user)
+static void rcu_flavor_sched_clock_irq(int user)
{
if (user || rcu_is_cpu_rrupt_from_idle()) {
@@ -1115,22 +942,6 @@ static void rcu_flavor_check_callbacks(int user)
}
}
-/* PREEMPT=n implementation of synchronize_rcu(). */
-void synchronize_rcu(void)
-{
- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
- lock_is_held(&rcu_lock_map) ||
- lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
- if (rcu_blocking_is_gp())
- return;
- if (rcu_gp_is_expedited())
- synchronize_rcu_expedited();
- else
- wait_rcu_gp(call_rcu);
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
/*
* Because preemptible RCU does not exist, tasks cannot possibly exit
* while in preemptible RCU read-side critical sections.
@@ -1150,18 +961,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+/*
+ * If boosting, set rcuc kthreads to realtime priority.
+ */
+static void rcu_cpu_kthread_setup(unsigned int cpu)
+{
#ifdef CONFIG_RCU_BOOST
+ struct sched_param sp;
-static void rcu_wake_cond(struct task_struct *t, int status)
-{
- /*
- * If the thread is yielding, only wake it when this
- * is invoked from idle
- */
- if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
- wake_up_process(t);
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+#endif /* #ifdef CONFIG_RCU_BOOST */
}
+#ifdef CONFIG_RCU_BOOST
+
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -1276,8 +1090,6 @@ static int rcu_boost_kthread(void *arg)
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
- struct task_struct *t;
-
raw_lockdep_assert_held_rcu_node(rnp);
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1291,38 +1103,20 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
if (rnp->exp_tasks == NULL)
rnp->boost_tasks = rnp->gp_tasks;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- t = rnp->boost_kthread_task;
- if (t)
- rcu_wake_cond(t, rnp->boost_kthread_status);
+ rcu_wake_cond(rnp->boost_kthread_task,
+ rnp->boost_kthread_status);
} else {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
/*
- * Wake up the per-CPU kthread to invoke RCU callbacks.
- */
-static void invoke_rcu_callbacks_kthread(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __this_cpu_write(rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_cpu_kthread_task)) {
- rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
- __this_cpu_read(rcu_cpu_kthread_status));
- }
- local_irq_restore(flags);
-}
-
-/*
* Is the current CPU running the RCU-callbacks kthread?
* Caller must have preemption disabled.
*/
static bool rcu_is_callbacks_kthread(void)
{
- return __this_cpu_read(rcu_cpu_kthread_task) == current;
+ return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
}
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1369,65 +1163,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
return 0;
}
-static void rcu_kthread_do_work(void)
-{
- rcu_do_batch(this_cpu_ptr(&rcu_data));
-}
-
-static void rcu_cpu_kthread_setup(unsigned int cpu)
-{
- struct sched_param sp;
-
- sp.sched_priority = kthread_prio;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-}
-
-static void rcu_cpu_kthread_park(unsigned int cpu)
-{
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-}
-
-static int rcu_cpu_kthread_should_run(unsigned int cpu)
-{
- return __this_cpu_read(rcu_cpu_has_work);
-}
-
-/*
- * Per-CPU kernel thread that invokes RCU callbacks. This replaces
- * the RCU softirq used in configurations of RCU that do not support RCU
- * priority boosting.
- */
-static void rcu_cpu_kthread(unsigned int cpu)
-{
- unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
- char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
- int spincnt;
-
- for (spincnt = 0; spincnt < 10; spincnt++) {
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
- local_bh_disable();
- *statusp = RCU_KTHREAD_RUNNING;
- this_cpu_inc(rcu_cpu_kthread_loops);
- local_irq_disable();
- work = *workp;
- *workp = 0;
- local_irq_enable();
- if (work)
- rcu_kthread_do_work();
- local_bh_enable();
- if (*workp == 0) {
- trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
- *statusp = RCU_KTHREAD_WAITING;
- return;
- }
- }
- *statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
- trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
- *statusp = RCU_KTHREAD_WAITING;
-}
-
/*
* Set the per-rcu_node kthread's affinity to cover all CPUs that are
* served by the rcu_node in question. The CPU hotplug lock is still
@@ -1458,27 +1193,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
free_cpumask_var(cm);
}
-static struct smp_hotplug_thread rcu_cpu_thread_spec = {
- .store = &rcu_cpu_kthread_task,
- .thread_should_run = rcu_cpu_kthread_should_run,
- .thread_fn = rcu_cpu_kthread,
- .thread_comm = "rcuc/%u",
- .setup = rcu_cpu_kthread_setup,
- .park = rcu_cpu_kthread_park,
-};
-
/*
* Spawn boost kthreads -- called as soon as the scheduler is running.
*/
static void __init rcu_spawn_boost_kthreads(void)
{
struct rcu_node *rnp;
- int cpu;
- for_each_possible_cpu(cpu)
- per_cpu(rcu_cpu_has_work, cpu) = 0;
- if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
- return;
rcu_for_each_leaf_node(rnp)
(void)rcu_spawn_one_boost_kthread(rnp);
}
@@ -1501,11 +1222,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static void invoke_rcu_callbacks_kthread(void)
-{
- WARN_ON_ONCE(1);
-}
-
static bool rcu_is_callbacks_kthread(void)
{
return false;
@@ -1543,7 +1259,7 @@ static void rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return rcu_cpu_has_callbacks(NULL);
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist);
}
/*
@@ -1562,14 +1278,6 @@ static void rcu_prepare_for_idle(void)
{
}
-/*
- * Don't bother keeping a running count of the number of RCU callbacks
- * posted because CONFIG_RCU_FAST_NO_HZ=n.
- */
-static void rcu_idle_count_callbacks_posted(void)
-{
-}
-
#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
/*
@@ -1652,11 +1360,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
lockdep_assert_irqs_disabled();
- /* Snapshot to detect later posting of non-lazy callback. */
- rdp->nonlazy_posted_snap = rdp->nonlazy_posted;
-
/* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) {
+ if (rcu_segcblist_empty(&rdp->cblist)) {
*nextevt = KTIME_MAX;
return 0;
}
@@ -1670,11 +1375,12 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
rdp->last_accelerate = jiffies;
/* Request timer delay depending on laziness, and round. */
- if (!rdp->all_lazy) {
+ rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist);
+ if (rdp->all_lazy) {
+ dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+ } else {
dj = round_up(rcu_idle_gp_delay + jiffies,
rcu_idle_gp_delay) - jiffies;
- } else {
- dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}
*nextevt = basemono + dj * TICK_NSEC;
return 0;
@@ -1704,7 +1410,7 @@ static void rcu_prepare_for_idle(void)
/* Handle nohz enablement switches conservatively. */
tne = READ_ONCE(tick_nohz_active);
if (tne != rdp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(NULL))
+ if (!rcu_segcblist_empty(&rdp->cblist))
invoke_rcu_core(); /* force nohz to see update. */
rdp->tick_nohz_enabled_snap = tne;
return;
@@ -1717,10 +1423,8 @@ static void rcu_prepare_for_idle(void)
* callbacks, invoke RCU core for the side-effect of recalculating
* idle duration on re-entry to idle.
*/
- if (rdp->all_lazy &&
- rdp->nonlazy_posted != rdp->nonlazy_posted_snap) {
+ if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) {
rdp->all_lazy = false;
- rdp->nonlazy_posted_snap = rdp->nonlazy_posted;
invoke_rcu_core();
return;
}
@@ -1756,142 +1460,49 @@ static void rcu_cleanup_after_idle(void)
invoke_rcu_core();
}
-/*
- * Keep a running count of the number of non-lazy callbacks posted
- * on this CPU. This running counter (which is never decremented) allows
- * rcu_prepare_for_idle() to detect when something out of the idle loop
- * posts a callback, even if an equal number of callbacks are invoked.
- * Of course, callbacks should only be posted from within a trace event
- * designed to be called from idle or from within RCU_NONIDLE().
- */
-static void rcu_idle_count_callbacks_posted(void)
-{
- __this_cpu_add(rcu_data.nonlazy_posted, 1);
-}
-
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_FAST_NO_HZ
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
- unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap;
-
- sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
- rdp->last_accelerate & 0xffff, jiffies & 0xffff,
- ulong2long(nlpd),
- rdp->all_lazy ? 'L' : '.',
- rdp->tick_nohz_enabled_snap ? '.' : 'D');
-}
-
-#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
-{
- *cp = '\0';
-}
-
-#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
-
-/* Initiate the stall-info list. */
-static void print_cpu_stall_info_begin(void)
-{
- pr_cont("\n");
-}
-
-/*
- * Print out diagnostic information for the specified stalled CPU.
- *
- * If the specified CPU is aware of the current RCU grace period, then
- * print the number of scheduling clock interrupts the CPU has taken
- * during the time that it has been aware. Otherwise, print the number
- * of RCU grace periods that this CPU is ignorant of, for example, "1"
- * if the CPU was aware of the previous grace period.
- *
- * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
- */
-static void print_cpu_stall_info(int cpu)
-{
- unsigned long delta;
- char fast_no_hz[72];
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- char *ticks_title;
- unsigned long ticks_value;
-
- /*
- * We could be printing a lot while holding a spinlock. Avoid
- * triggering hard lockup.
- */
- touch_nmi_watchdog();
-
- ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
- if (ticks_value) {
- ticks_title = "GPs behind";
- } else {
- ticks_title = "ticks this GP";
- ticks_value = rdp->ticks_this_gp;
- }
- print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
- pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
- cpu,
- "O."[!!cpu_online(cpu)],
- "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
- "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
- !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
- rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
- "!."[!delta],
- ticks_value, ticks_title,
- rcu_dynticks_snap(rdp) & 0xfff,
- rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
- rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
- READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
- fast_no_hz);
-}
-
-/* Terminate the stall-info list. */
-static void print_cpu_stall_info_end(void)
-{
- pr_err("\t");
-}
-
-/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
-{
- rdp->ticks_this_gp = 0;
- rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
- WRITE_ONCE(rdp->last_fqs_resched, jiffies);
-}
-
#ifdef CONFIG_RCU_NOCB_CPU
/*
* Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For each CPU in the set, there is a
- * kthread created that pulls the callbacks from the corresponding CPU,
- * waits for a grace period to elapse, and invokes the callbacks.
- * The no-CBs CPUs do a wake_up() on their kthread when they insert
- * a callback into any empty list, unless the rcu_nocb_poll boot parameter
- * has been specified, in which case each kthread actively polls its
- * CPU. (Which isn't so great for energy efficiency, but which does
- * reduce RCU's overhead on that CPU.)
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into leaders, which manage incoming callbacks, wait for
+ * grace periods, and awaken followers, and the followers, which only
+ * invoke callbacks. Each leader is its own follower. The no-CBs CPUs
+ * do a wake_up() on their kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
*
* This is intended to be used in conjunction with Frederic Weisbecker's
* adaptive-idle work, which would seriously reduce OS jitter on CPUs
* running CPU-bound user-mode computations.
*
- * Offloading of callback processing could also in theory be used as
- * an energy-efficiency measure because CPUs with no RCU callbacks
- * queued are more aggressive about entering dyntick-idle mode.
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
*/
-/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
+ * comma-separated list of CPUs and/or CPU ranges. If an invalid list is
+ * given, a warning is emitted and all CPUs are offloaded.
+ */
static int __init rcu_nocb_setup(char *str)
{
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- cpulist_parse(str, rcu_nocb_mask);
+ if (!strcasecmp(str, "all"))
+ cpumask_setall(rcu_nocb_mask);
+ else
+ if (cpulist_parse(str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
return 1;
}
__setup("rcu_nocbs=", rcu_nocb_setup);
@@ -1987,10 +1598,7 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
-/*
- * Does the specified CPU need an RCU callback for this invocation
- * of rcu_barrier()?
- */
+/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */
static bool rcu_nocb_cpu_needs_barrier(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
@@ -2006,8 +1614,8 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu)
* callbacks would be posted. In the worst case, the first
* barrier in rcu_barrier() suffices (but the caller cannot
* necessarily rely on this, not a substitute for the caller
- * getting the concurrency design right!). There must also be
- * a barrier between the following load an posting of a callback
+ * getting the concurrency design right!). There must also be a
+ * barrier between the following load and posting of a callback
* (if a callback is in fact needed). This is associated with an
* atomic_inc() in the caller.
*/
@@ -2517,9 +2125,9 @@ static void rcu_spawn_one_nocb_kthread(int cpu)
/*
* If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthreads, spawn them.
+ * rcuo kthread, spawn it.
*/
-static void rcu_spawn_all_nocb_kthreads(int cpu)
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
if (rcu_scheduler_fully_active)
rcu_spawn_one_nocb_kthread(cpu);
@@ -2536,7 +2144,7 @@ static void __init rcu_spawn_nocb_kthreads(void)
int cpu;
for_each_online_cpu(cpu)
- rcu_spawn_all_nocb_kthreads(cpu);
+ rcu_spawn_cpu_nocb_kthread(cpu);
}
/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
@@ -2670,7 +2278,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
}
-static void rcu_spawn_all_nocb_kthreads(int cpu)
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
new file mode 100644
index 000000000000..065183391f75
--- /dev/null
+++ b/kernel/rcu/tree_stall.h
@@ -0,0 +1,711 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RCU CPU stall warnings for normal RCU grace periods
+ *
+ * Copyright IBM Corporation, 2019
+ *
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
+ */
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Controlling CPU stall warnings, including delay calculation.
+
+/* panic() on RCU Stall sysctl. */
+int sysctl_panic_on_rcu_stall __read_mostly;
+
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA 0
+#endif
+
+/* Limit-check stall timeouts specified at boottime and runtime. */
+int rcu_jiffies_till_stall_check(void)
+{
+ int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
+
+ /*
+ * Limit check must be consistent with the Kconfig limits
+ * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+ */
+ if (till_stall_check < 3) {
+ WRITE_ONCE(rcu_cpu_stall_timeout, 3);
+ till_stall_check = 3;
+ } else if (till_stall_check > 300) {
+ WRITE_ONCE(rcu_cpu_stall_timeout, 300);
+ till_stall_check = 300;
+ }
+ return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
+
+/* Don't do RCU CPU stall warnings during long sysrq printouts. */
+void rcu_sysrq_start(void)
+{
+ if (!rcu_cpu_stall_suppress)
+ rcu_cpu_stall_suppress = 2;
+}
+
+void rcu_sysrq_end(void)
+{
+ if (rcu_cpu_stall_suppress == 2)
+ rcu_cpu_stall_suppress = 0;
+}
+
+/* Don't print RCU CPU stall warnings during a kernel panic. */
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+ rcu_cpu_stall_suppress = 1;
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+ .notifier_call = rcu_panic,
+};
+
+static int __init check_cpu_stall_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+ return 0;
+}
+early_initcall(check_cpu_stall_init);
+
+/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
+static void panic_on_rcu_stall(void)
+{
+ if (sysctl_panic_on_rcu_stall)
+ panic("RCU Stall\n");
+}
+
+/**
+ * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
+ *
+ * Set the stall-warning timeout way off into the future, thus preventing
+ * any RCU CPU stall-warning messages from appearing in the current set of
+ * RCU grace periods.
+ *
+ * The caller must disable hard irqs.
+ */
+void rcu_cpu_stall_reset(void)
+{
+ WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Interaction with RCU grace periods
+
+/* Start of new grace period, so record stall time (and forcing times). */
+static void record_gp_stall_check_time(void)
+{
+ unsigned long j = jiffies;
+ unsigned long j1;
+
+ rcu_state.gp_start = j;
+ j1 = rcu_jiffies_till_stall_check();
+ /* Record ->gp_start before ->jiffies_stall. */
+ smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */
+ rcu_state.jiffies_resched = j + j1 / 2;
+ rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
+}
+
+/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */
+static void zero_cpu_stall_ticks(struct rcu_data *rdp)
+{
+ rdp->ticks_this_gp = 0;
+ rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
+ WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+}
+
+/*
+ * If too much time has passed in the current grace period, and if
+ * so configured, go kick the relevant kthreads.
+ */
+static void rcu_stall_kick_kthreads(void)
+{
+ unsigned long j;
+
+ if (!rcu_kick_kthreads)
+ return;
+ j = READ_ONCE(rcu_state.jiffies_kick_kthreads);
+ if (time_after(jiffies, j) && rcu_state.gp_kthread &&
+ (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) {
+ WARN_ONCE(1, "Kicking %s grace-period kthread\n",
+ rcu_state.name);
+ rcu_ftrace_dump(DUMP_ALL);
+ wake_up_process(rcu_state.gp_kthread);
+ WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ);
+ }
+}
+
+/*
+ * Handler for the irq_work request posted about halfway into the RCU CPU
+ * stall timeout, and used to detect excessive irq disabling. Set state
+ * appropriately, but just complain if there is unexpected state on entry.
+ */
+static void rcu_iw_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = container_of(iwp, struct rcu_data, rcu_iw);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp);
+ if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
+ rdp->rcu_iw_gp_seq = rnp->gp_seq;
+ rdp->rcu_iw_pending = false;
+ }
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Printing RCU CPU stall warnings
+
+#ifdef CONFIG_PREEMPT
+
+/*
+ * Dump detailed information for all tasks blocking the current RCU
+ * grace period on the specified rcu_node structure.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct task_struct *t;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ t = list_entry(rnp->gp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ /*
+ * We could be printing a lot while holding a spinlock.
+ * Avoid triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+ sched_show_task(t);
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+}
+
+/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+ int ndetected = 0;
+
+ if (!rcu_preempt_blocked_readers_cgp(rnp))
+ return 0;
+ pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
+ rnp->level, rnp->grplo, rnp->grphi);
+ t = list_entry(rnp->gp_tasks->prev,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ pr_cont("\n");
+ return ndetected;
+}
+
+#else /* #ifdef CONFIG_PREEMPT */
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
+{
+}
+
+/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections.
+ */
+static int rcu_print_task_stall(struct rcu_node *rnp)
+{
+ return 0;
+}
+#endif /* #else #ifdef CONFIG_PREEMPT */
+
+/*
+ * Dump stacks of all tasks running on stalled CPUs. First try using
+ * NMIs, but fall back to manual remote stack tracing on architectures
+ * that don't support NMI-based stack dumps. The NMI-triggered stack
+ * traces are more accurate because they are printed by the target CPU.
+ */
+static void rcu_dump_cpu_stacks(void)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu))
+ if (!trigger_single_cpu_backtrace(cpu))
+ dump_cpu_task(cpu);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+}
+
+#ifdef CONFIG_RCU_FAST_NO_HZ
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+
+ sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c",
+ rdp->last_accelerate & 0xffff, jiffies & 0xffff,
+ ".l"[rdp->all_lazy],
+ ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)],
+ ".D"[!!rdp->tick_nohz_enabled_snap]);
+}
+
+#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
+{
+ *cp = '\0';
+}
+
+#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
+
+/*
+ * Print out diagnostic information for the specified stalled CPU.
+ *
+ * If the specified CPU is aware of the current RCU grace period, then
+ * print the number of scheduling clock interrupts the CPU has taken
+ * during the time that it has been aware. Otherwise, print the number
+ * of RCU grace periods that this CPU is ignorant of, for example, "1"
+ * if the CPU was aware of the previous grace period.
+ *
+ * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
+ */
+static void print_cpu_stall_info(int cpu)
+{
+ unsigned long delta;
+ char fast_no_hz[72];
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ char *ticks_title;
+ unsigned long ticks_value;
+
+ /*
+ * We could be printing a lot while holding a spinlock. Avoid
+ * triggering hard lockup.
+ */
+ touch_nmi_watchdog();
+
+ ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq);
+ if (ticks_value) {
+ ticks_title = "GPs behind";
+ } else {
+ ticks_title = "ticks this GP";
+ ticks_value = rdp->ticks_this_gp;
+ }
+ print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
+ delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n",
+ cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
+ "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+ !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+ rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+ "!."[!delta],
+ ticks_value, ticks_title,
+ rcu_dynticks_snap(rdp) & 0xfff,
+ rdp->dynticks_nesting, rdp->dynticks_nmi_nesting,
+ rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
+ READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
+ fast_no_hz);
+}
+
+/* Complain about starvation of grace-period kthread. */
+static void rcu_check_gp_kthread_starvation(void)
+{
+ struct task_struct *gpk = rcu_state.gp_kthread;
+ unsigned long j;
+
+ j = jiffies - READ_ONCE(rcu_state.gp_activity);
+ if (j > 2 * HZ) {
+ pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
+ rcu_state.name, j,
+ (long)rcu_seq_current(&rcu_state.gp_seq),
+ READ_ONCE(rcu_state.gp_flags),
+ gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
+ gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1);
+ if (gpk) {
+ pr_err("RCU grace-period kthread stack dump:\n");
+ sched_show_task(gpk);
+ wake_up_process(gpk);
+ }
+ }
+}
+
+static void print_other_cpu_stall(unsigned long gp_seq)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long gpa;
+ unsigned long j;
+ int ndetected = 0;
+ struct rcu_node *rnp;
+ long totqlen = 0;
+
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads();
+ if (rcu_cpu_stall_suppress)
+ return;
+
+ /*
+ * OK, time to rat on our buddy...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
+ rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ ndetected += rcu_print_task_stall(rnp);
+ if (rnp->qsmask != 0) {
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
+ print_cpu_stall_info(cpu);
+ ndetected++;
+ }
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+
+ for_each_possible_cpu(cpu)
+ totqlen += rcu_get_n_cbs_cpu(cpu);
+ pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n",
+ smp_processor_id(), (long)(jiffies - rcu_state.gp_start),
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+ if (ndetected) {
+ rcu_dump_cpu_stacks();
+
+ /* Complain about tasks blocking the grace period. */
+ rcu_for_each_leaf_node(rnp)
+ rcu_print_detail_task_stall_rnp(rnp);
+ } else {
+ if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) {
+ pr_err("INFO: Stall ended before state dump start\n");
+ } else {
+ j = jiffies;
+ gpa = READ_ONCE(rcu_state.gp_activity);
+ pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
+ rcu_state.name, j - gpa, j, gpa,
+ READ_ONCE(jiffies_till_next_fqs),
+ rcu_get_root()->qsmask);
+ /* In this case, the current CPU might be at fault. */
+ sched_show_task(current);
+ }
+ }
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+
+ rcu_check_gp_kthread_starvation();
+
+ panic_on_rcu_stall();
+
+ rcu_force_quiescent_state(); /* Kick them all. */
+}
+
+static void print_cpu_stall(void)
+{
+ int cpu;
+ unsigned long flags;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rcu_get_root();
+ long totqlen = 0;
+
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads();
+ if (rcu_cpu_stall_suppress)
+ return;
+
+ /*
+ * OK, time to rat on ourselves...
+ * See Documentation/RCU/stallwarn.txt for info on how to debug
+ * RCU CPU stall warnings.
+ */
+ pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
+ raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
+ print_cpu_stall_info(smp_processor_id());
+ raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
+ for_each_possible_cpu(cpu)
+ totqlen += rcu_get_n_cbs_cpu(cpu);
+ pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n",
+ jiffies - rcu_state.gp_start,
+ (long)rcu_seq_current(&rcu_state.gp_seq), totqlen);
+
+ rcu_check_gp_kthread_starvation();
+
+ rcu_dump_cpu_stacks();
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ /* Rewrite if needed in case of slow consoles. */
+ if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall)))
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ panic_on_rcu_stall();
+
+ /*
+ * Attempt to revive the RCU machinery by forcing a context switch.
+ *
+ * A context switch would normally allow the RCU state machine to make
+ * progress and it could be we're stuck in kernel space without context
+ * switches for an entirely unreasonable amount of time.
+ */
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
+static void check_cpu_stall(struct rcu_data *rdp)
+{
+ unsigned long gs1;
+ unsigned long gs2;
+ unsigned long gps;
+ unsigned long j;
+ unsigned long jn;
+ unsigned long js;
+ struct rcu_node *rnp;
+
+ if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
+ !rcu_gp_in_progress())
+ return;
+ rcu_stall_kick_kthreads();
+ j = jiffies;
+
+ /*
+ * Lots of memory barriers to reject false positives.
+ *
+ * The idea is to pick up rcu_state.gp_seq, then
+ * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally
+ * another copy of rcu_state.gp_seq. These values are updated in
+ * the opposite order with memory barriers (or equivalent) during
+ * grace-period initialization and cleanup. Now, a false positive
+ * can occur if we get an new value of rcu_state.gp_start and a old
+ * value of rcu_state.jiffies_stall. But given the memory barriers,
+ * the only way that this can happen is if one grace period ends
+ * and another starts between these two fetches. This is detected
+ * by comparing the second fetch of rcu_state.gp_seq with the
+ * previous fetch from rcu_state.gp_seq.
+ *
+ * Given this check, comparisons of jiffies, rcu_state.jiffies_stall,
+ * and rcu_state.gp_start suffice to forestall false positives.
+ */
+ gs1 = READ_ONCE(rcu_state.gp_seq);
+ smp_rmb(); /* Pick up ->gp_seq first... */
+ js = READ_ONCE(rcu_state.jiffies_stall);
+ smp_rmb(); /* ...then ->jiffies_stall before the rest... */
+ gps = READ_ONCE(rcu_state.gp_start);
+ smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */
+ gs2 = READ_ONCE(rcu_state.gp_seq);
+ if (gs1 != gs2 ||
+ ULONG_CMP_LT(j, js) ||
+ ULONG_CMP_GE(gps, js))
+ return; /* No stall or GP completed since entering function. */
+ rnp = rdp->mynode;
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ if (rcu_gp_in_progress() &&
+ (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
+ cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall();
+
+ } else if (rcu_gp_in_progress() &&
+ ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
+ cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+
+ /* They had a few time units to dump stack, so complain. */
+ print_other_cpu_stall(gs2);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RCU forward-progress mechanisms, including of callback invocation.
+
+
+/*
+ * Show the state of the grace-period kthreads.
+ */
+void show_rcu_gp_kthreads(void)
+{
+ int cpu;
+ unsigned long j;
+ unsigned long ja;
+ unsigned long jr;
+ unsigned long jw;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ j = jiffies;
+ ja = j - READ_ONCE(rcu_state.gp_activity);
+ jr = j - READ_ONCE(rcu_state.gp_req_activity);
+ jw = j - READ_ONCE(rcu_state.gp_wake_time);
+ pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n",
+ rcu_state.name, gp_state_getname(rcu_state.gp_state),
+ rcu_state.gp_state,
+ rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL,
+ ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq),
+ (long)READ_ONCE(rcu_state.gp_seq),
+ (long)READ_ONCE(rcu_get_root()->gp_seq_needed),
+ READ_ONCE(rcu_state.gp_flags));
+ rcu_for_each_node_breadth_first(rnp) {
+ if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed))
+ continue;
+ pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n",
+ rnp->grplo, rnp->grphi, (long)rnp->gp_seq,
+ (long)rnp->gp_seq_needed);
+ if (!rcu_is_leaf_node(rnp))
+ continue;
+ for_each_leaf_node_possible_cpu(rnp, cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->gpwrap ||
+ ULONG_CMP_GE(rcu_state.gp_seq,
+ rdp->gp_seq_needed))
+ continue;
+ pr_info("\tcpu %d ->gp_seq_needed %ld\n",
+ cpu, (long)rdp->gp_seq_needed);
+ }
+ }
+ /* sched_show_task(rcu_state.gp_kthread); */
+}
+EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
+
+/*
+ * This function checks for grace-period requests that fail to motivate
+ * RCU to come out of its idle mode.
+ */
+static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
+ const unsigned long gpssdelay)
+{
+ unsigned long flags;
+ unsigned long j;
+ struct rcu_node *rnp_root = rcu_get_root();
+ static atomic_t warned = ATOMIC_INIT(0);
+
+ if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() ||
+ ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed))
+ return;
+ j = jiffies; /* Expensive access, and in common case don't get here. */
+ if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
+ atomic_read(&warned))
+ return;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ j = jiffies;
+ if (rcu_gp_in_progress() ||
+ ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
+ time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) ||
+ time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) ||
+ atomic_read(&warned)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ /* Hold onto the leaf lock to make others see warned==1. */
+
+ if (rnp_root != rnp)
+ raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ j = jiffies;
+ if (rcu_gp_in_progress() ||
+ ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) ||
+ time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
+ time_before(j, rcu_state.gp_activity + gpssdelay) ||
+ atomic_xchg(&warned, 1)) {
+ if (rnp_root != rnp)
+ /* irqs remain disabled. */
+ raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return;
+ }
+ WARN_ON(1);
+ if (rnp_root != rnp)
+ raw_spin_unlock_rcu_node(rnp_root);
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ show_rcu_gp_kthreads();
+}
+
+/*
+ * Do a forward-progress check for rcutorture. This is normally invoked
+ * due to an OOM event. The argument "j" gives the time period during
+ * which rcutorture would like progress to have been made.
+ */
+void rcu_fwd_progress_check(unsigned long j)
+{
+ unsigned long cbs;
+ int cpu;
+ unsigned long max_cbs = 0;
+ int max_cpu = -1;
+ struct rcu_data *rdp;
+
+ if (rcu_gp_in_progress()) {
+ pr_info("%s: GP age %lu jiffies\n",
+ __func__, jiffies - rcu_state.gp_start);
+ show_rcu_gp_kthreads();
+ } else {
+ pr_info("%s: Last GP end %lu jiffies ago\n",
+ __func__, jiffies - rcu_state.gp_end);
+ preempt_disable();
+ rdp = this_cpu_ptr(&rcu_data);
+ rcu_check_gp_start_stall(rdp->mynode, rdp, j);
+ preempt_enable();
+ }
+ for_each_possible_cpu(cpu) {
+ cbs = rcu_get_n_cbs_cpu(cpu);
+ if (!cbs)
+ continue;
+ if (max_cpu < 0)
+ pr_info("%s: callbacks", __func__);
+ pr_cont(" %d: %lu", cpu, cbs);
+ if (cbs <= max_cbs)
+ continue;
+ max_cbs = cbs;
+ max_cpu = cpu;
+ }
+ if (max_cpu >= 0)
+ pr_cont("\n");
+}
+EXPORT_SYMBOL_GPL(rcu_fwd_progress_check);
+
+/* Commandeer a sysrq key to dump RCU's tree. */
+static bool sysrq_rcu;
+module_param(sysrq_rcu, bool, 0444);
+
+/* Dump grace-period-request information due to commandeered sysrq. */
+static void sysrq_show_rcu(int key)
+{
+ show_rcu_gp_kthreads();
+}
+
+static struct sysrq_key_op sysrq_rcudump_op = {
+ .handler = sysrq_show_rcu,
+ .help_msg = "show-rcu(y)",
+ .action_msg = "Show RCU tree",
+ .enable_mask = SYSRQ_ENABLE_DUMP,
+};
+
+static int __init rcu_sysrq_init(void)
+{
+ if (sysrq_rcu)
+ return register_sysrq_key('y', &sysrq_rcudump_op);
+ return 0;
+}
+early_initcall(rcu_sysrq_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 1971869c4072..61df2bf08563 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -1,26 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Read-Copy Update mechanism for mutual exclusion
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright IBM Corporation, 2001
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
*
- * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
* Papers:
* http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
@@ -52,6 +39,7 @@
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
+#include <linux/kprobes.h>
#define CREATE_TRACE_POINTS
@@ -249,6 +237,7 @@ int notrace debug_lockdep_rcu_enabled(void)
current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
+NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);
/**
* rcu_read_lock_held() - might we be in RCU read-side critical section?
@@ -434,69 +423,25 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
do { } while (0)
#endif
-#ifdef CONFIG_RCU_STALL_COMMON
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+/* Get rcutorture access to sched_setaffinity(). */
+long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ int ret;
-#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_DELAY_DELTA (5 * HZ)
-#else
-#define RCU_STALL_DELAY_DELTA 0
+ ret = sched_setaffinity(pid, in_mask);
+ WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
#endif
+#ifdef CONFIG_RCU_STALL_COMMON
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
-static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
-
module_param(rcu_cpu_stall_suppress, int, 0644);
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
-
-int rcu_jiffies_till_stall_check(void)
-{
- int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);
-
- /*
- * Limit check must be consistent with the Kconfig limits
- * for CONFIG_RCU_CPU_STALL_TIMEOUT.
- */
- if (till_stall_check < 3) {
- WRITE_ONCE(rcu_cpu_stall_timeout, 3);
- till_stall_check = 3;
- } else if (till_stall_check > 300) {
- WRITE_ONCE(rcu_cpu_stall_timeout, 300);
- till_stall_check = 300;
- }
- return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
-}
-EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check);
-
-void rcu_sysrq_start(void)
-{
- if (!rcu_cpu_stall_suppress)
- rcu_cpu_stall_suppress = 2;
-}
-
-void rcu_sysrq_end(void)
-{
- if (rcu_cpu_stall_suppress == 2)
- rcu_cpu_stall_suppress = 0;
-}
-
-static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
-{
- rcu_cpu_stall_suppress = 1;
- return NOTIFY_DONE;
-}
-
-static struct notifier_block rcu_panic_block = {
- .notifier_call = rcu_panic,
-};
-
-static int __init check_cpu_stall_init(void)
-{
- atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
- return 0;
-}
-early_initcall(check_cpu_stall_init);
-
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
#ifdef CONFIG_TASKS_RCU
diff --git a/kernel/reboot.c b/kernel/reboot.c
index e1b79b6a2735..c4d472b7f1b4 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/reboot.c
*
@@ -31,6 +32,7 @@ EXPORT_SYMBOL(cad_pid);
#define DEFAULT_REBOOT_MODE
#endif
enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
/*
* This variable is used privately to keep track of whether or not
@@ -519,6 +521,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
static int __init reboot_setup(char *str)
{
for (;;) {
+ enum reboot_mode *mode;
+
/*
* Having anything passed on the command line via
* reboot= will cause us to disable DMI checking
@@ -526,17 +530,24 @@ static int __init reboot_setup(char *str)
*/
reboot_default = 0;
+ if (!strncmp(str, "panic_", 6)) {
+ mode = &panic_reboot_mode;
+ str += 6;
+ } else {
+ mode = &reboot_mode;
+ }
+
switch (*str) {
case 'w':
- reboot_mode = REBOOT_WARM;
+ *mode = REBOOT_WARM;
break;
case 'c':
- reboot_mode = REBOOT_COLD;
+ *mode = REBOOT_COLD;
break;
case 'h':
- reboot_mode = REBOOT_HARD;
+ *mode = REBOOT_HARD;
break;
case 's':
@@ -553,11 +564,11 @@ static int __init reboot_setup(char *str)
if (rc)
return rc;
} else
- reboot_mode = REBOOT_SOFT;
+ *mode = REBOOT_SOFT;
break;
}
case 'g':
- reboot_mode = REBOOT_GPIO;
+ *mode = REBOOT_GPIO;
break;
case 'b':
diff --git a/kernel/relay.c b/kernel/relay.c
index 9e0f52375487..ade14fb7ce2e 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1177,7 +1177,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
}
static const struct pipe_buf_operations relay_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = relay_pipe_buf_release,
.steal = generic_pipe_buf_steal,
diff --git a/kernel/resource.c b/kernel/resource.c
index 915c02e8e5dd..158f04ec1d4f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/resource.c
*
@@ -382,7 +383,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
int (*func)(struct resource *, void *))
{
struct resource res;
- int ret = -1;
+ int ret = -EINVAL;
while (start < end &&
!find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
@@ -448,12 +449,13 @@ int walk_mem_res(u64 start, u64 end, void *arg,
arg, func);
}
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
-
/*
* This function calls the @func callback against all memory ranges of type
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
* It is to be used only for System RAM.
+ *
+ * This will find System RAM ranges that are children of top-level resources
+ * in addition to top-level System RAM resources.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -462,14 +464,14 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
unsigned long flags;
struct resource res;
unsigned long pfn, end_pfn;
- int ret = -1;
+ int ret = -EINVAL;
start = (u64) start_pfn << PAGE_SHIFT;
end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
while (start < end &&
!find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
- true, &res)) {
+ false, &res)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
@@ -481,8 +483,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
return ret;
}
-#endif
-
static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
{
return 1;
@@ -521,21 +521,20 @@ EXPORT_SYMBOL_GPL(page_is_ram);
int region_intersects(resource_size_t start, size_t size, unsigned long flags,
unsigned long desc)
{
- resource_size_t end = start + size - 1;
+ struct resource res;
int type = 0; int other = 0;
struct resource *p;
+ res.start = start;
+ res.end = start + size - 1;
+
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
bool is_type = (((p->flags & flags) == flags) &&
((desc == IORES_DESC_NONE) ||
(desc == p->desc)));
- if (start >= p->start && start <= p->end)
- is_type ? type++ : other++;
- if (end >= p->start && end <= p->end)
- is_type ? type++ : other++;
- if (p->start >= start && p->end <= end)
+ if (resource_overlaps(p, &res))
is_type ? type++ : other++;
}
read_unlock(&resource_lock);
@@ -1132,6 +1131,15 @@ struct resource * __request_region(struct resource *parent,
conflict = __request_resource(parent, res);
if (!conflict)
break;
+ /*
+ * mm/hmm.c reserves physical addresses which then
+ * become unavailable to other users. Conflicts are
+ * not expected. Warn to aid debugging if encountered.
+ */
+ if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+ pr_warn("Unaddressable device %s %pR conflicts with %pR",
+ conflict->name, conflict, res);
+ }
if (conflict != parent) {
if (!(conflict->flags & IORESOURCE_BUSY)) {
parent = conflict;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 25e9a7b60eba..27c48eb7de40 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs)
* - signal delivery,
* and return to user-space.
*
- * This is how we can ensure that the entire rseq critical section,
- * consisting of both the C part and the assembly instruction sequence,
+ * This is how we can ensure that the entire rseq critical section
* will issue the commit instruction only if executed atomically with
* respect to other threads scheduled on the same CPU, and with respect
* to signal handlers.
@@ -278,7 +277,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
error:
sig = ksig ? ksig->sig : 0;
- force_sigsegv(sig, t);
+ force_sigsegv(sig);
}
#ifdef CONFIG_DEBUG_RSEQ
@@ -297,7 +296,7 @@ void rseq_syscall(struct pt_regs *regs)
return;
if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
- force_sig(SIGSEGV, t);
+ force_sig(SIGSEGV);
}
#endif
@@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
/* Unregister rseq for current thread. */
if (current->rseq != rseq || !current->rseq)
return -EINVAL;
- if (current->rseq_len != rseq_len)
+ if (rseq_len != sizeof(*rseq))
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
if (ret)
return ret;
current->rseq = NULL;
- current->rseq_len = 0;
current->rseq_sig = 0;
return 0;
}
@@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || current->rseq_len != rseq_len)
+ if (current->rseq != rseq || rseq_len != sizeof(*rseq))
return -EINVAL;
if (current->rseq_sig != sig)
return -EPERM;
@@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
if (!access_ok(rseq, rseq_len))
return -EFAULT;
current->rseq = rseq;
- current->rseq_len = rseq_len;
current->rseq_sig = sig;
/*
* If rseq was previously inactive, and has just been
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2d4ff5353ded..2067080bb235 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -259,7 +259,6 @@ out:
}
#endif /* CONFIG_PROC_FS */
-#ifdef CONFIG_SCHED_DEBUG
int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
if (!task_group_is_autogroup(tg))
@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
-#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e3e3b979f9bd..1152259a4ca0 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sched_clock() for unstable CPU clocks
*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8d76a65cfdd..2b037f195473 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/core.c
*
@@ -22,6 +23,17 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+/*
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
+ * associated with them) to allow external modules to probe them.
+ */
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
@@ -107,11 +119,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* [L] ->on_rq
* RELEASE (rq->lock)
*
- * If we observe the old CPU in task_rq_lock, the acquire of
+ * If we observe the old CPU in task_rq_lock(), the acquire of
* the old rq->lock will fully serialize against the stores.
*
- * If we observe the new CPU in task_rq_lock, the acquire will
- * pair with the WMB to ensure we must then also see migrating.
+ * If we observe the new CPU in task_rq_lock(), the address
+ * dependency headed by '[L] rq = task_rq()' and the acquire
+ * will pair with the WMB to ensure we then also see migrating.
*/
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
@@ -180,6 +193,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
+ update_rq_clock_pelt(rq, delta);
}
void update_rq_clock(struct rq *rq)
@@ -396,19 +410,7 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
-/**
- * wake_q_add() - queue a wakeup for 'later' waking.
- * @head: the wake_q_head to add @task to
- * @task: the task to queue for 'later' wakeup
- *
- * Queue a task for later wakeup, most likely by the wake_up_q() call in the
- * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
- * instantly.
- *
- * This function must be used as-if it were wake_up_process(); IOW the task
- * must be ready to be woken at this location.
- */
-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
struct wake_q_node *node = &task->wake_q;
@@ -421,16 +423,56 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
* state, even in the failed case, an explicit smp_mb() must be used.
*/
smp_mb__before_atomic();
- if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))
- return;
-
- get_task_struct(task);
+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
+ return false;
/*
* The head is context local, there can be no concurrency.
*/
*head->lastp = node;
head->lastp = &node->next;
+ return true;
+}
+
+/**
+ * wake_q_add() - queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ */
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ if (__wake_q_add(head, task))
+ get_task_struct(task);
+}
+
+/**
+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
+ * @head: the wake_q_head to add @task to
+ * @task: the task to queue for 'later' wakeup
+ *
+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the
+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
+ * instantly.
+ *
+ * This function must be used as-if it were wake_up_process(); IOW the task
+ * must be ready to be woken at this location.
+ *
+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers
+ * that already hold reference to @task can call the 'safe' version and trust
+ * wake_q to do the right thing depending whether or not the @task is already
+ * queued for wakeup.
+ */
+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
+{
+ if (!__wake_q_add(head, task))
+ put_task_struct(task);
}
void wake_up_q(struct wake_q_head *head)
@@ -730,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}
+#ifdef CONFIG_UCLAMP_TASK
+/* Max allowed minimum utilization */
+unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+
+/* Max allowed maximum utilization */
+unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+
+/* All clamps are required to be less or equal than these values */
+static struct uclamp_se uclamp_default[UCLAMP_CNT];
+
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+#define for_each_clamp_id(clamp_id) \
+ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
+{
+ return clamp_value / UCLAMP_BUCKET_DELTA;
+}
+
+static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
+{
+ return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
+}
+
+static inline unsigned int uclamp_none(int clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_se_set(struct uclamp_se *uc_se,
+ unsigned int value, bool user_defined)
+{
+ uc_se->value = value;
+ uc_se->bucket_id = uclamp_bucket_id(value);
+ uc_se->user_defined = user_defined;
+}
+
+static inline unsigned int
+uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+ unsigned int clamp_value)
+{
+ /*
+ * Avoid blocked utilization pushing up the frequency when we go
+ * idle (which drops the max-clamp) by retaining the last known
+ * max-clamp.
+ */
+ if (clamp_id == UCLAMP_MAX) {
+ rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
+ return clamp_value;
+ }
+
+ return uclamp_none(UCLAMP_MIN);
+}
+
+static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+ unsigned int clamp_value)
+{
+ /* Reset max-clamp retention only on idle exit */
+ if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ return;
+
+ WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+}
+
+static inline
+unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
+ unsigned int clamp_value)
+{
+ struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
+ int bucket_id = UCLAMP_BUCKETS - 1;
+
+ /*
+ * Since both min and max clamps are max aggregated, find the
+ * top most bucket with tasks in.
+ */
+ for ( ; bucket_id >= 0; bucket_id--) {
+ if (!bucket[bucket_id].tasks)
+ continue;
+ return bucket[bucket_id].value;
+ }
+
+ /* No tasks -- default clamp values */
+ return uclamp_idle_value(rq, clamp_id, clamp_value);
+}
+
+/*
+ * The effective clamp bucket index of a task depends on, by increasing
+ * priority:
+ * - the task specific clamp value, when explicitly requested from userspace
+ * - the system default clamp value, defined by the sysadmin
+ */
+static inline struct uclamp_se
+uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+{
+ struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ struct uclamp_se uc_max = uclamp_default[clamp_id];
+
+ /* System default restrictions always apply */
+ if (unlikely(uc_req.value > uc_max.value))
+ return uc_max;
+
+ return uc_req;
+}
+
+unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+{
+ struct uclamp_se uc_eff;
+
+ /* Task currently refcounted: use back-annotated (effective) value */
+ if (p->uclamp[clamp_id].active)
+ return p->uclamp[clamp_id].value;
+
+ uc_eff = uclamp_eff_get(p, clamp_id);
+
+ return uc_eff.value;
+}
+
+/*
+ * When a task is enqueued on a rq, the clamp bucket currently defined by the
+ * task's uclamp::bucket_id is refcounted on that rq. This also immediately
+ * updates the rq's clamp value if required.
+ *
+ * Tasks can have a task-specific value requested from user-space, track
+ * within each bucket the maximum value for tasks refcounted in it.
+ * This "local max aggregation" allows to track the exact "requested" value
+ * for each bucket when all its RUNNABLE tasks require the same clamp.
+ */
+static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+ unsigned int clamp_id)
+{
+ struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+ struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+ struct uclamp_bucket *bucket;
+
+ lockdep_assert_held(&rq->lock);
+
+ /* Update task effective clamp */
+ p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
+
+ bucket = &uc_rq->bucket[uc_se->bucket_id];
+ bucket->tasks++;
+ uc_se->active = true;
+
+ uclamp_idle_reset(rq, clamp_id, uc_se->value);
+
+ /*
+ * Local max aggregation: rq buckets always track the max
+ * "requested" clamp value of its RUNNABLE tasks.
+ */
+ if (bucket->tasks == 1 || uc_se->value > bucket->value)
+ bucket->value = uc_se->value;
+
+ if (uc_se->value > READ_ONCE(uc_rq->value))
+ WRITE_ONCE(uc_rq->value, uc_se->value);
+}
+
+/*
+ * When a task is dequeued from a rq, the clamp bucket refcounted by the task
+ * is released. If this is the last task reference counting the rq's max
+ * active clamp value, then the rq's clamp value is updated.
+ *
+ * Both refcounted tasks and rq's cached clamp values are expected to be
+ * always valid. If it's detected they are not, as defensive programming,
+ * enforce the expected state and warn.
+ */
+static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+ unsigned int clamp_id)
+{
+ struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+ struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+ struct uclamp_bucket *bucket;
+ unsigned int bkt_clamp;
+ unsigned int rq_clamp;
+
+ lockdep_assert_held(&rq->lock);
+
+ bucket = &uc_rq->bucket[uc_se->bucket_id];
+ SCHED_WARN_ON(!bucket->tasks);
+ if (likely(bucket->tasks))
+ bucket->tasks--;
+ uc_se->active = false;
+
+ /*
+ * Keep "local max aggregation" simple and accept to (possibly)
+ * overboost some RUNNABLE tasks in the same bucket.
+ * The rq clamp bucket value is reset to its base value whenever
+ * there are no more RUNNABLE tasks refcounting it.
+ */
+ if (likely(bucket->tasks))
+ return;
+
+ rq_clamp = READ_ONCE(uc_rq->value);
+ /*
+ * Defensive programming: this should never happen. If it happens,
+ * e.g. due to future modification, warn and fixup the expected value.
+ */
+ SCHED_WARN_ON(bucket->value > rq_clamp);
+ if (bucket->value >= rq_clamp) {
+ bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
+ WRITE_ONCE(uc_rq->value, bkt_clamp);
+ }
+}
+
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+{
+ unsigned int clamp_id;
+
+ if (unlikely(!p->sched_class->uclamp_enabled))
+ return;
+
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /* Reset clamp idle holding when there is one RUNNABLE task */
+ if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
+{
+ unsigned int clamp_id;
+
+ if (unlikely(!p->sched_class->uclamp_enabled))
+ return;
+
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_dec_id(rq, p, clamp_id);
+}
+
+int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_min, old_max;
+ static DEFINE_MUTEX(mutex);
+ int result;
+
+ mutex_lock(&mutex);
+ old_min = sysctl_sched_uclamp_util_min;
+ old_max = sysctl_sched_uclamp_util_max;
+
+ result = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (result)
+ goto undo;
+ if (!write)
+ goto done;
+
+ if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+ result = -EINVAL;
+ goto undo;
+ }
+
+ if (old_min != sysctl_sched_uclamp_util_min) {
+ uclamp_se_set(&uclamp_default[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ }
+ if (old_max != sysctl_sched_uclamp_util_max) {
+ uclamp_se_set(&uclamp_default[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+ }
+
+ /*
+ * Updating all the RUNNABLE task is expensive, keep it simple and do
+ * just a lazy update at each next enqueue time.
+ */
+ goto done;
+
+undo:
+ sysctl_sched_uclamp_util_min = old_min;
+ sysctl_sched_uclamp_util_max = old_max;
+done:
+ mutex_unlock(&mutex);
+
+ return result;
+}
+
+static int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
+ unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
+ lower_bound = attr->sched_util_min;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
+ upper_bound = attr->sched_util_max;
+
+ if (lower_bound > upper_bound)
+ return -EINVAL;
+ if (upper_bound > SCHED_CAPACITY_SCALE)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ unsigned int clamp_id;
+
+ /*
+ * On scheduling class change, reset to default clamps for tasks
+ * without a task-specific value.
+ */
+ for_each_clamp_id(clamp_id) {
+ struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+ unsigned int clamp_value = uclamp_none(clamp_id);
+
+ /* Keep using defined clamps across class changes */
+ if (uc_se->user_defined)
+ continue;
+
+ /* By default, RT tasks always get 100% boost */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ clamp_value = uclamp_none(UCLAMP_MAX);
+
+ uclamp_se_set(uc_se, clamp_value, false);
+ }
+
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
+ return;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
+ attr->sched_util_min, true);
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
+ attr->sched_util_max, true);
+ }
+}
+
+static void uclamp_fork(struct task_struct *p)
+{
+ unsigned int clamp_id;
+
+ for_each_clamp_id(clamp_id)
+ p->uclamp[clamp_id].active = false;
+
+ if (likely(!p->sched_reset_on_fork))
+ return;
+
+ for_each_clamp_id(clamp_id) {
+ unsigned int clamp_value = uclamp_none(clamp_id);
+
+ /* By default, RT tasks always get 100% boost */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ clamp_value = uclamp_none(UCLAMP_MAX);
+
+ uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
+ }
+}
+
+static void __init init_uclamp(void)
+{
+ struct uclamp_se uc_max = {};
+ unsigned int clamp_id;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+ cpu_rq(cpu)->uclamp_flags = 0;
+ }
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&init_task.uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ }
+
+ /* System defaults allow max clamp values for both indexes */
+ uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
+ for_each_clamp_id(clamp_id)
+ uclamp_default[clamp_id] = uc_max;
+}
+
+#else /* CONFIG_UCLAMP_TASK */
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr) { }
+static inline void uclamp_fork(struct task_struct *p) { }
+static inline void init_uclamp(void) { }
+#endif /* CONFIG_UCLAMP_TASK */
+
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
@@ -740,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
}
+ uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
@@ -753,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
+ uclamp_rq_dec(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -762,10 +1201,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
rq->nr_uninterruptible--;
enqueue_task(rq, p, flags);
+
+ p->on_rq = TASK_ON_RQ_QUEUED;
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
+ p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
@@ -890,12 +1333,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
}
/*
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
if (is_per_cpu_kthread(p))
@@ -928,7 +1371,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
{
lockdep_assert_held(&rq->lock);
- p->on_rq = TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
dequeue_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
@@ -990,7 +1433,7 @@ static int migration_cpu_stop(void *data)
local_irq_disable();
/*
* We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
sched_ttwu_pending();
@@ -1021,7 +1464,7 @@ static int migration_cpu_stop(void *data)
*/
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
- cpumask_copy(&p->cpus_allowed, new_mask);
+ cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
@@ -1091,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- if (cpumask_equal(&p->cpus_allowed, new_mask))
+ if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;
if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1121,7 +1564,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, p, &rf);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
return 0;
} else if (task_on_rq_queued(p)) {
/*
@@ -1207,11 +1649,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
rq_pin_lock(src_rq, &srf);
rq_pin_lock(dst_rq, &drf);
- p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
@@ -1254,10 +1694,10 @@ static int migrate_swap_stop(void *data)
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
goto unlock;
- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1299,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
goto out;
- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1447,7 +1887,7 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
/*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
*
* A few notes on cpu_active vs cpu_online:
*
@@ -1487,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, &p->cpus_allowed) {
+ for_each_cpu(dest_cpu, p->cpus_ptr) {
if (!is_cpu_allowed(p, dest_cpu))
continue;
@@ -1538,7 +1978,7 @@ out:
}
/*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1548,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
- cpu = cpumask_any(&p->cpus_allowed);
+ cpu = cpumask_any(p->cpus_ptr);
/*
* In order not to call set_task_cpu() on a blocking task we need
- * to rely on ttwu() to place the task on a valid ->cpus_allowed
+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
* CPU.
*
* Since this is common to all placement strategies, this lives here.
@@ -1651,16 +2091,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
__schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
-{
- activate_task(rq, p, en_flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
-
- /* If a worker is waking up, notify the workqueue: */
- if (p->flags & PF_WQ_WORKER)
- wq_worker_waking_up(p, cpu_of(rq));
-}
-
/*
* Mark the task runnable and perform wakeup-preemption.
*/
@@ -1712,7 +2142,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
en_flags |= ENQUEUE_MIGRATED;
#endif
- ttwu_activate(rq, p, en_flags);
+ activate_task(rq, p, en_flags);
ttwu_do_wakeup(rq, p, wake_flags, rf);
}
@@ -1969,6 +2399,30 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
+ preempt_disable();
+ if (p == current) {
+ /*
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
+ * == smp_processor_id()'. Together this means we can special
+ * case the whole 'p->on_rq && ttwu_remote()' case below
+ * without taking any locks.
+ *
+ * In particular:
+ * - we rely on Program-Order guarantees for all the ordering,
+ * - we're serialized against set_special_state() by virtue of
+ * it disabling IRQs (this allows not taking ->pi_lock).
+ */
+ if (!(p->state & state))
+ goto out;
+
+ success = 1;
+ cpu = task_cpu(p);
+ trace_sched_waking(p);
+ p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+ goto out;
+ }
+
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
@@ -1978,7 +2432,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
if (!(p->state & state))
- goto out;
+ goto unlock;
trace_sched_waking(p);
@@ -2008,7 +2462,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
- goto stat;
+ goto unlock;
#ifdef CONFIG_SMP
/*
@@ -2068,65 +2522,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
-stat:
- ttwu_stat(p, cpu, wake_flags);
-out:
+unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+out:
+ if (success)
+ ttwu_stat(p, cpu, wake_flags);
+ preempt_enable();
return success;
}
/**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- * @rf: request-queue flags for pinning
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
-{
- struct rq *rq = task_rq(p);
-
- if (WARN_ON_ONCE(rq != this_rq()) ||
- WARN_ON_ONCE(p == current))
- return;
-
- lockdep_assert_held(&rq->lock);
-
- if (!raw_spin_trylock(&p->pi_lock)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we've
- * not yet picked a replacement task.
- */
- rq_unlock(rq, rf);
- raw_spin_lock(&p->pi_lock);
- rq_relock(rq, rf);
- }
-
- if (!(p->state & TASK_NORMAL))
- goto out;
-
- trace_sched_waking(p);
-
- if (!task_on_rq_queued(p)) {
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&rq->nr_iowait);
- }
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
- }
-
- ttwu_do_wakeup(rq, p, 0, rf);
- ttwu_stat(p, smp_processor_id(), 0);
-out:
- raw_spin_unlock(&p->pi_lock);
-}
-
-/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -2190,6 +2596,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+#ifdef CONFIG_COMPACTION
+ p->capture_control = NULL;
+#endif
init_numa_balancing(clone_flags, p);
}
@@ -2325,6 +2734,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
*/
p->prio = current->normal_prio;
+ uclamp_fork(p);
+
/*
* Revert to default priority/policy on fork if requested.
*/
@@ -2420,7 +2831,7 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
- * - cpus_allowed can change in the fork path
+ * - cpus_ptr can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -2431,10 +2842,9 @@ void wake_up_new_task(struct task_struct *p)
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
- post_init_entity_util_avg(&p->se);
+ post_init_entity_util_avg(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
- p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
@@ -3059,7 +3469,6 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
- cpu_load_update_active(rq);
calc_global_load_tick(rq);
psi_task_tick(rq);
@@ -3433,25 +3842,11 @@ static void __sched notrace __schedule(bool preempt)
prev->state = TASK_RUNNING;
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
- prev->on_rq = 0;
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
-
- /*
- * If a worker went to sleep, notify and ask workqueue
- * whether it wants to wake up a task to maintain
- * concurrency.
- */
- if (prev->flags & PF_WQ_WORKER) {
- struct task_struct *to_wakeup;
-
- to_wakeup = wq_worker_sleeping(prev);
- if (to_wakeup)
- try_to_wake_up_local(to_wakeup, &rf);
- }
}
switch_count = &prev->nvcsw;
}
@@ -3511,6 +3906,20 @@ static inline void sched_submit_work(struct task_struct *tsk)
{
if (!tsk->state || tsk_is_pi_blocked(tsk))
return;
+
+ /*
+ * If a worker went to sleep, notify and ask workqueue whether
+ * it wants to wake up a task to maintain concurrency.
+ * As this function is called inside the schedule() context,
+ * we disable preemption to avoid it calling schedule() again
+ * in the possible wakeup of a kworker.
+ */
+ if (tsk->flags & PF_WQ_WORKER) {
+ preempt_disable();
+ wq_worker_sleeping(tsk);
+ preempt_enable_no_resched();
+ }
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -3519,6 +3928,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
+static void sched_update_worker(struct task_struct *tsk)
+{
+ if (tsk->flags & PF_WQ_WORKER)
+ wq_worker_running(tsk);
+}
+
asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
@@ -3529,6 +3944,7 @@ asmlinkage __visible void __sched schedule(void)
__schedule(false);
sched_preempt_enable_no_resched();
} while (need_resched());
+ sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
@@ -4090,6 +4506,13 @@ static void __setscheduler_params(struct task_struct *p,
static void __setscheduler(struct rq *rq, struct task_struct *p,
const struct sched_attr *attr, bool keep_boost)
{
+ /*
+ * If params can't change scheduling class changes aren't allowed
+ * either.
+ */
+ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ return;
+
__setscheduler_params(p, attr);
/*
@@ -4227,6 +4650,13 @@ recheck:
return retval;
}
+ /* Update task specific "requested" clamps */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+ retval = uclamp_validate(p, attr);
+ if (retval)
+ return retval;
+ }
+
/*
* Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -4256,6 +4686,8 @@ recheck:
goto change;
if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
task_rq_unlock(rq, p, &rf);
@@ -4286,7 +4718,7 @@ change:
* the entire root_domain to become SCHED_DEADLINE. We
* will also fail if there's no bandwidth available.
*/
- if (!cpumask_subset(span, &p->cpus_allowed) ||
+ if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
@@ -4336,7 +4768,9 @@ change:
put_prev_task(rq, p);
prev_class = p->sched_class;
+
__setscheduler(rq, p, attr, pi);
+ __setscheduler_uclamp(p, attr);
if (queued) {
/*
@@ -4512,6 +4946,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
if (ret)
return -EFAULT;
+ if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
+ size < SCHED_ATTR_SIZE_VER1)
+ return -EINVAL;
+
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -4575,14 +5013,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if ((int)attr.sched_policy < 0)
return -EINVAL;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ attr.sched_policy = SETPARAM_POLICY;
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setattr(p, &attr);
+ if (likely(p))
+ get_task_struct(p);
rcu_read_unlock();
+ if (likely(p)) {
+ retval = sched_setattr(p, &attr);
+ put_task_struct(p);
+ }
+
return retval;
}
@@ -4733,6 +5178,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
else
attr.sched_nice = task_nice(p);
+#ifdef CONFIG_UCLAMP_TASK
+ attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+#endif
+
rcu_read_unlock();
retval = sched_read_attr(uattr, &attr, size);
@@ -4885,7 +5335,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
@@ -5142,7 +5592,7 @@ long __sched io_schedule_timeout(long timeout)
}
EXPORT_SYMBOL(io_schedule_timeout);
-void io_schedule(void)
+void __sched io_schedule(void)
{
int token;
@@ -5265,9 +5715,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
- compat_pid_t, pid,
- struct old_timespec32 __user *, interval)
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+ struct old_timespec32 __user *, interval)
{
struct timespec64 t;
int retval = sched_rr_get_interval(pid, &t);
@@ -5463,7 +5912,7 @@ int task_can_attach(struct task_struct *p,
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
* success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
+ * before cpus_mask may be changed.
*/
if (p->flags & PF_NO_SETAFFINITY) {
ret = -EINVAL;
@@ -5490,7 +5939,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (curr_cpu == target_cpu)
return 0;
- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -5628,7 +6077,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
put_prev_task(rq, next);
/*
- * Rules for changing task_struct::cpus_allowed are holding
+ * Rules for changing task_struct::cpus_mask are holding
* both pi_lock and rq->lock, such that holding either
* stabilizes the mask.
*
@@ -5867,14 +6316,11 @@ void __init sched_init_smp(void)
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
- * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
- * but there won't be any contention on it.
+ * happen.
*/
- cpus_read_lock();
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
mutex_unlock(&sched_domains_mutex);
- cpus_read_unlock();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
@@ -5889,7 +6335,7 @@ void __init sched_init_smp(void)
static int __init migration_init(void)
{
- sched_rq_cpu_starting(smp_processor_id());
+ sched_cpu_starting(smp_processor_id());
return 0;
}
early_initcall(migration_init);
@@ -5925,8 +6371,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void)
{
- int i, j;
unsigned long alloc_size = 0, ptr;
+ int i;
wait_bit_init();
@@ -6028,10 +6474,6 @@ void __init sched_init(void)
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
-
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
- rq->cpu_load[j] = 0;
-
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -6086,6 +6528,8 @@ void __init sched_init(void)
psi_init();
+ init_uclamp();
+
scheduler_running = 1;
}
@@ -6162,6 +6606,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL(___might_sleep);
+
+void __cant_sleep(const char *file, int line, int preempt_offset)
+{
+ static unsigned long prev_jiffy;
+
+ if (irqs_disabled())
+ return;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ if (preempt_count() > preempt_offset)
+ return;
+
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
@@ -6502,6 +6974,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
{
+ if (shareval > scale_load_down(ULONG_MAX))
+ shareval = MAX_SHARES;
return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
@@ -6517,7 +6991,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
static DEFINE_MUTEX(cfs_constraints_mutex);
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -6597,20 +7071,22 @@ out_unlock:
return ret;
}
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
u64 quota, period;
period = ktime_to_ns(tg->cfs_bandwidth.period);
if (cfs_quota_us < 0)
quota = RUNTIME_INF;
- else
+ else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+ else
+ return -EINVAL;
return tg_set_cfs_bandwidth(tg, period, quota);
}
-long tg_get_cfs_quota(struct task_group *tg)
+static long tg_get_cfs_quota(struct task_group *tg)
{
u64 quota_us;
@@ -6623,17 +7099,20 @@ long tg_get_cfs_quota(struct task_group *tg)
return quota_us;
}
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
u64 quota, period;
+ if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
period = (u64)cfs_period_us * NSEC_PER_USEC;
quota = tg->cfs_bandwidth.quota;
return tg_set_cfs_bandwidth(tg, period, quota);
}
-long tg_get_cfs_period(struct task_group *tg)
+static long tg_get_cfs_period(struct task_group *tg)
{
u64 cfs_period_us;
@@ -6941,7 +7420,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
{
char tok[21]; /* U64_MAX */
- if (!sscanf(buf, "%s %llu", tok, periodp))
+ if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
return -EINVAL;
*periodp *= NSEC_PER_USEC;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 50316455ea66..5cc4012572ec 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/cpudl.c
*
* Global CPU deadline management
*
* Author: Juri Lelli <j.lelli@sssup.it>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include "sched.h"
@@ -124,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+ cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
- if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 22bd8980f32f..b5dcd1d83c7f 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -7,7 +7,7 @@
*/
#include "sched.h"
-DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
/**
* cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
@@ -48,8 +48,8 @@ EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
*
* Clear the update_util_data pointer for the given CPU.
*
- * Callers must use RCU-sched callbacks to free any memory that might be
- * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * Callers must use RCU callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_rcu()
* right after this function to avoid use-after-free.
*/
void cpufreq_remove_update_util_hook(int cpu)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 033ec7c45f13..636ca6f88c8e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -13,6 +13,8 @@
#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>
+#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
+
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
@@ -48,7 +50,6 @@ struct sugov_cpu {
bool iowait_boost_pending;
unsigned int iowait_boost;
- unsigned int iowait_boost_max;
u64 last_update;
unsigned long bw_dl;
@@ -195,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
-unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type)
+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type,
+ struct task_struct *p)
{
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
- if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
+ if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
+ type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
+ }
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
@@ -218,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
+ *
+ * CFS and RT utilization can be boosted or capped, depending on
+ * utilization clamp constraints requested by currently RUNNABLE
+ * tasks.
+ * When there are no CFS RUNNABLE tasks, clamps are released and
+ * frequency will be gracefully reduced with the utilization decay.
*/
- util = util_cfs;
- util += cpu_util_rt(rq);
+ util = util_cfs + cpu_util_rt(rq);
+ if (type == FREQUENCY_UTIL)
+ util = uclamp_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
@@ -275,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util = cpu_util_cfs(rq);
- unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+ unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
- return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
+ return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}
/**
@@ -291,8 +302,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
*
* The IO wait boost of a task is disabled after a tick since the last update
* of a CPU. If a new IO wait boost is requested after more then a tick, then
- * we enable the boost starting from the minimum frequency, which improves
- * energy efficiency by ignoring sporadic wakeups from IO.
+ * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
+ * efficiency by ignoring sporadic wakeups from IO.
*/
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
bool set_iowait_boost)
@@ -303,8 +314,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
if (delta_ns <= TICK_NSEC)
return false;
- sg_cpu->iowait_boost = set_iowait_boost
- ? sg_cpu->sg_policy->policy->min : 0;
+ sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
sg_cpu->iowait_boost_pending = set_iowait_boost;
return true;
@@ -318,8 +328,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
*
* Each time a task wakes up after an IO operation, the CPU utilization can be
* boosted to a certain utilization which doubles at each "frequent and
- * successive" wakeup from IO, ranging from the utilization of the minimum
- * OPP to the utilization of the maximum OPP.
+ * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
+ * of the maximum OPP.
+ *
* To keep doubling, an IO boost has to be requested at least once per tick,
* otherwise we restart from the utilization of the minimum OPP.
*/
@@ -344,14 +355,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
/* Double the boost at each request */
if (sg_cpu->iowait_boost) {
- sg_cpu->iowait_boost <<= 1;
- if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
- sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ sg_cpu->iowait_boost =
+ min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
return;
}
/* First wakeup after IO: start with minimum boost */
- sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}
/**
@@ -373,47 +383,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
- unsigned long *util, unsigned long *max)
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long util, unsigned long max)
{
- unsigned int boost_util, boost_max;
+ unsigned long boost;
/* No boost currently required */
if (!sg_cpu->iowait_boost)
- return;
+ return util;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
- return;
+ return util;
- /*
- * An IO waiting task has just woken up:
- * allow to further double the boost value
- */
- if (sg_cpu->iowait_boost_pending) {
- sg_cpu->iowait_boost_pending = false;
- } else {
+ if (!sg_cpu->iowait_boost_pending) {
/*
- * Otherwise: reduce the boost value and disable it when we
- * reach the minimum.
+ * No boost pending; reduce the boost value.
*/
sg_cpu->iowait_boost >>= 1;
- if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
sg_cpu->iowait_boost = 0;
- return;
+ return util;
}
}
+ sg_cpu->iowait_boost_pending = false;
+
/*
- * Apply the current boost value: a CPU is boosted only if its current
- * utilization is smaller then the current IO boost level.
+ * @util is already in capacity scale; convert iowait_boost
+ * into the same scale so we can compare.
*/
- boost_util = sg_cpu->iowait_boost;
- boost_max = sg_cpu->iowait_boost_max;
- if (*util * boost_max < *max * boost_util) {
- *util = boost_util;
- *max = boost_max;
- }
+ boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
+ return max(boost, util);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -460,7 +461,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
- sugov_iowait_apply(sg_cpu, time, &util, &max);
+ util = sugov_iowait_apply(sg_cpu, time, util, max);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
@@ -500,7 +501,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
j_util = sugov_get_util(j_sg_cpu);
j_max = j_sg_cpu->max;
- sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
+ j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
if (j_util * max > j_max * util) {
util = j_util;
@@ -609,13 +610,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count
static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
-static struct attribute *sugov_attributes[] = {
+static struct attribute *sugov_attrs[] = {
&rate_limit_us.attr,
NULL
};
+ATTRIBUTE_GROUPS(sugov);
static struct kobj_type sugov_tunables_ktype = {
- .default_attrs = sugov_attributes,
+ .default_groups = sugov_groups,
.sysfs_ops = &governor_sysfs_ops,
};
@@ -782,6 +784,7 @@ out:
return 0;
fail:
+ kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
sugov_tunables_free(tunables);
@@ -837,7 +840,6 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
- sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
}
for_each_cpu(cpu, policy->cpus) {
@@ -859,7 +861,7 @@ static void sugov_stop(struct cpufreq_policy *policy)
for_each_cpu(cpu, policy->cpus)
cpufreq_remove_update_util_hook(cpu);
- synchronize_sched();
+ synchronize_rcu();
if (!policy->fast_switch_enabled) {
irq_work_sync(&sg_policy->irq_work);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index daaadf939ccb..b7abca987d94 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/cpupri.c
*
@@ -20,11 +21,6 @@
* searches). For tasks with affinity restrictions, the algorithm has a
* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include "sched.h"
@@ -98,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
* We have to ensure that we have at least one bit
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ba4a143bdcf3..2305ce89a26c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple CPU accounting cgroup controller
*/
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fb8b7b5d745d..ef5b9f6b1d42 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p)
if (dl_entity_is_special(dl_se))
return;
- WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
zerolag_time = dl_se->deadline -
@@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p)
* If the "0-lag time" already passed, decrease the active
* utilization now, instead of starting a timer
*/
- if (zerolag_time < 0) {
+ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
if (dl_task(p))
sub_running_bw(dl_se, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {
@@ -539,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online CPU:
*/
- cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+ cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
if (cpu >= nr_cpu_ids) {
/*
* Failed to find any suitable CPU.
@@ -727,7 +726,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* refill the runtime and set the deadline a period in the future,
* because keeping the current (absolute) deadline of the task would
* result in breaking guarantees promised to other tasks (refer to
- * Documentation/scheduler/sched-deadline.txt for more information).
+ * Documentation/scheduler/sched-deadline.rst for more information).
*
* This function returns true if:
*
@@ -1196,7 +1195,7 @@ static void update_curr_dl(struct rq *rq)
&curr->dl);
} else {
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
- unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
scaled_delta_exec = cap_scale(delta_exec, scale_freq);
scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
@@ -1767,7 +1766,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
deadline_queue_push_tasks(rq);
if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1776,7 +1775,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1793,7 +1792,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* Even when we have runtime, update_curr_dl() might have resulted in us
* not being the leftmost task anymore. In that case NEED_RESCHED will
@@ -1825,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
}
@@ -1975,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index de3de997e245..f7e4579e746c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/debug.c
*
* Print the CFS rbtree and other debugging details
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include "sched.h"
@@ -236,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
*tablep = NULL;
}
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler,
- bool load_idx)
+ umode_t mode, proc_handler *proc_handler)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
-
- if (load_idx) {
- entry->extra1 = &min_load_idx;
- entry->extra2 = &max_load_idx;
- }
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table = sd_alloc_ctl_entry(9);
if (table == NULL)
return NULL;
- set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
+ /* &table[8] is terminator */
return table;
}
@@ -315,6 +298,7 @@ void register_sched_domain_sysctl(void)
{
static struct ctl_table *cpu_entries;
static struct ctl_table **cpu_idx;
+ static bool init_done = false;
char buf[32];
int i;
@@ -344,7 +328,10 @@ void register_sched_domain_sysctl(void)
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
+ }
+ if (!init_done) {
+ init_done = true;
/* init to possible to not have holes in @cpu_entries */
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
@@ -652,8 +639,6 @@ do { \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
P(nr_running);
- SEQ_printf(m, " .%-30s: %lu\n", "load",
- rq->load.weight);
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
@@ -661,11 +646,6 @@ do { \
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
PN(clock_task);
- P(cpu_load[0]);
- P(cpu_load[1]);
- P(cpu_load[2]);
- P(cpu_load[3]);
- P(cpu_load[4]);
#undef P
#undef PN
@@ -698,7 +678,7 @@ do { \
static const char *sched_tunable_scaling_names[] = {
"none",
- "logaritmic",
+ "logarithmic",
"linear"
};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 310d0637fe4b..036be95a87e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -248,13 +248,6 @@ const struct sched_class fair_sched_class;
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/* cpu runqueue to which this cfs_rq is attached */
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rq;
-}
-
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
@@ -282,79 +275,116 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
- if (!cfs_rq->on_list) {
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ if (!path)
+ return;
+
+ if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
+ autogroup_path(cfs_rq->tg, path, len);
+ else if (cfs_rq && cfs_rq->tg->css.cgroup)
+ cgroup_path(cfs_rq->tg->css.cgroup, path, len);
+ else
+ strlcpy(path, "(null)", len);
+}
+
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ if (cfs_rq->on_list)
+ return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
+
+ cfs_rq->on_list = 1;
+
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the top of the tree
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
/*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases and a special case for the root
- * cfs_rq. Furthermore, it also means that we will always reset
- * tmp_alone_branch either when the branch is connected
- * to a tree or when we reach the beg of the tree
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
*/
- if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
- /*
- * If parent is already on the list, we add the child
- * just before. Thanks to circular linked property of
- * the list, this means to put the child at the tail
- * of the list that starts by parent.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
- /*
- * The branch is now connected to its tree so we can
- * reset tmp_alone_branch to the beginning of the
- * list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else if (!cfs_rq->tg->parent) {
- /*
- * cfs rq without parent should be put
- * at the tail of the list.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq->leaf_cfs_rq_list);
- /*
- * We have reach the beg of a tree so we can reset
- * tmp_alone_branch to the beginning of the list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- } else {
- /*
- * The parent has not already been added so we want to
- * make sure that it will be put after us.
- * tmp_alone_branch points to the beg of the branch
- * where we will add parent.
- */
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- rq->tmp_alone_branch);
- /*
- * update tmp_alone_branch to points to the new beg
- * of the branch
- */
- rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
- }
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
+ }
- cfs_rq->on_list = 1;
+ if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reach the top of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ return true;
}
+
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the begin of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new begin
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
+ return false;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ /*
+ * With cfs_rq being unthrottled/throttled during an enqueue,
+ * it can happen the tmp_alone_branch points the a leaf that
+ * we finally want to del. In this case, tmp_alone_branch moves
+ * to the prev element but it will point to rq->leaf_cfs_rq_list
+ * at the end of the enqueue.
+ */
+ if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
+ rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
+
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
cfs_rq->on_list = 0;
}
}
-/* Iterate through all leaf cfs_rq's on a runqueue: */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+ SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
+ leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
@@ -410,12 +440,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
return container_of(se, struct task_struct, se);
}
-static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
-{
- return container_of(cfs_rq, struct rq, cfs);
-}
-
-
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -438,16 +462,27 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
-static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+ if (path)
+ strlcpy(path, "(null)", len);
+}
+
+static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
+ return true;
}
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
- for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+static inline void assert_list_leaf_cfs_rq(struct rq *rq)
+{
+}
+
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
+ for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
@@ -686,9 +721,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
-#ifdef CONFIG_SMP
#include "pelt.h"
-#include "sched-pelt.h"
+#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
@@ -744,11 +778,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
* if util_avg > util_avg_cap.
*/
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
{
+ struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
- long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (cap > 0) {
@@ -763,22 +798,19 @@ void post_init_entity_util_avg(struct sched_entity *se)
}
}
- if (entity_is_task(se)) {
- struct task_struct *p = task_of(se);
- if (p->sched_class != &fair_sched_class) {
- /*
- * For !fair tasks do:
- *
- update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se, 0);
- switched_from_fair(rq, p);
- *
- * such that the next switched_to_fair() has the
- * expected state.
- */
- se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
- return;
- }
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq);
+ attach_entity_load_avg(cfs_rq, se, 0);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
+ return;
}
attach_entity_cfs_rq(se);
@@ -788,7 +820,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
void init_entity_runnable_average(struct sched_entity *se)
{
}
-void post_init_entity_util_avg(struct sched_entity *se)
+void post_init_entity_util_avg(struct task_struct *p)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
@@ -1035,7 +1067,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
unsigned int sysctl_numa_balancing_scan_delay = 1000;
struct numa_group {
- atomic_t refcount;
+ refcount_t refcount;
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
@@ -1104,7 +1136,7 @@ static unsigned int task_scan_start(struct task_struct *p)
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
}
@@ -1127,7 +1159,7 @@ static unsigned int task_scan_max(struct task_struct *p)
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
- period *= atomic_read(&ng->refcount);
+ period *= refcount_read(&ng->refcount);
period *= shared + 1;
period /= private + shared + 1;
@@ -1160,7 +1192,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* New address space, reset the preferred nid */
if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = -1;
+ p->numa_preferred_nid = NUMA_NO_NODE;
return;
}
@@ -1180,13 +1212,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}
static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
@@ -1400,7 +1432,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
@@ -1453,9 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(struct rq *rq);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_runnable_load(struct rq *rq);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -1476,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
- ns->load += weighted_cpuload(rq);
+ ns->load += cpu_runnable_load(rq);
ns->compute_capacity += capacity_of(cpu);
}
@@ -1608,7 +1638,7 @@ static void task_numa_compare(struct task_numa_env *env,
* be incurred if the tasks were swapped.
*/
/* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
goto unlock;
/*
@@ -1705,7 +1735,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
env->dst_cpu = cpu;
@@ -1848,7 +1878,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -1994,6 +2024,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+ /* Avoid time going backwards, prevent potential divide error: */
+ if (unlikely((s64)*period < 0))
+ *period = 0;
} else {
delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
@@ -2095,7 +2129,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
static void task_numa_placement(struct task_struct *p)
{
- int seq, nid, max_nid = -1;
+ int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
@@ -2203,12 +2237,12 @@ static void task_numa_placement(struct task_struct *p)
static inline int get_numa_group(struct numa_group *grp)
{
- return atomic_inc_not_zero(&grp->refcount);
+ return refcount_inc_not_zero(&grp->refcount);
}
static inline void put_numa_group(struct numa_group *grp)
{
- if (atomic_dec_and_test(&grp->refcount))
+ if (refcount_dec_and_test(&grp->refcount))
kfree_rcu(grp, rcu);
}
@@ -2229,7 +2263,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
return;
- atomic_set(&grp->refcount, 1);
+ refcount_set(&grp->refcount, 1);
grp->active_nodes = 1;
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
@@ -2580,7 +2614,7 @@ out:
/*
* Drive the periodic memory faults..
*/
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
struct callback_head *work = &curr->numa_work;
u64 period, now;
@@ -2638,7 +2672,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
* the preferred node.
*/
if (dst_nid == p->numa_preferred_nid ||
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
+ src_nid != p->numa_preferred_nid))
return;
}
@@ -2668,8 +2703,6 @@ static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se)) {
struct rq *rq = rq_of(cfs_rq);
@@ -2685,8 +2718,6 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
@@ -3122,7 +3153,7 @@ void set_task_rq_fair(struct sched_entity *se,
p_last_update_time = prev->avg.last_update_time;
n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+ __update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
@@ -3257,11 +3288,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/*
* runnable_sum can't be lower than running_sum
- * As running sum is scale with CPU capacity wehreas the runnable sum
- * is not we rescale running_sum 1st
+ * Rescale running sum to be in the same range as runnable sum
+ * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
+ * runnable_sum is in [0 : LOAD_AVG_MAX]
*/
- running_sum = se->avg.util_sum /
- arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
load_sum = (s64)se_weight(se) * runnable_sum;
@@ -3316,6 +3347,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
update_tg_cfs_util(cfs_rq, se, gcfs_rq);
update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
+ trace_pelt_cfs_tp(cfs_rq);
+ trace_pelt_se_tp(se);
+
return 1;
}
@@ -3364,7 +3398,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
- * @now: current time, as per cfs_rq_clock_task()
+ * @now: current time, as per cfs_rq_clock_pelt()
* @cfs_rq: cfs_rq to update
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
@@ -3409,7 +3443,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
decayed = 1;
}
- decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+ decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -3468,6 +3502,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq, flags);
+
+ trace_pelt_cfs_tp(cfs_rq);
}
/**
@@ -3487,6 +3523,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
+
+ trace_pelt_cfs_tp(cfs_rq);
}
/*
@@ -3499,9 +3537,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
+ u64 now = cfs_rq_clock_pelt(cfs_rq);
int decayed;
/*
@@ -3509,7 +3545,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* track group sched_entity load average for task_h_load calc in migration
*/
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
- __update_load_avg_se(now, cpu, cfs_rq, se);
+ __update_load_avg_se(now, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
@@ -3555,20 +3591,20 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
* Synchronize entity load avg of dequeued entity without locking
* the previous rq.
*/
-void sync_entity_load_avg(struct sched_entity *se)
+static void sync_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
+ __update_load_avg_blocked_se(last_update_time, se);
}
/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
-void remove_entity_load_avg(struct sched_entity *se)
+static void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
unsigned long flags;
@@ -3577,10 +3613,6 @@ void remove_entity_load_avg(struct sched_entity *se)
* tasks cannot exit without having gone through wake_up_new_task() ->
* post_init_entity_util_avg() which will have added things to the
* cfs_rq, so we can remove unconditionally.
- *
- * Similarly for groups, they will have passed through
- * post_init_entity_util_avg() before unregister_sched_fair_group()
- * calls this.
*/
sync_entity_load_avg(se);
@@ -3654,6 +3686,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
+ int cpu;
if (!sched_feat(UTIL_EST))
return;
@@ -3688,6 +3721,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
return;
/*
+ * To avoid overestimation of actual task utilization, skip updates if
+ * we cannot grant there is idle time in this CPU.
+ */
+ cpu = cpu_of(rq_of(cfs_rq));
+ if (task_util(p) > capacity_orig_of(cpu))
+ return;
+
+ /*
* Update Task's estimated utilization
*
* When *p completes an activation we can consolidate another sample
@@ -4079,7 +4120,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
- if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ if (schedstat_enabled() &&
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
schedstat_set(se->statistics.slice_max,
max((u64)schedstat_val(se->statistics.slice_max),
se->sum_exec_runtime - se->prev_sum_exec_runtime));
@@ -4429,6 +4471,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
+
+ /* Add cfs_rq with already running entity in the list */
+ if (cfs_rq->nr_running >= 1)
+ list_add_leaf_cfs_rq(cfs_rq);
}
return 0;
@@ -4440,8 +4486,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
/* group is entering throttled state, stop time */
- if (!cfs_rq->throttle_count)
+ if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ list_del_leaf_cfs_rq(cfs_rq);
+ }
cfs_rq->throttle_count++;
return 0;
@@ -4544,6 +4592,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
break;
}
+ assert_list_leaf_cfs_rq(rq);
+
if (!se)
add_nr_running(rq, task_delta);
@@ -4565,7 +4615,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
- rq_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -4582,7 +4632,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
unthrottle_cfs_rq(cfs_rq);
next:
- rq_unlock(rq, &rf);
+ rq_unlock_irqrestore(rq, &rf);
if (!remaining)
break;
@@ -4598,7 +4648,7 @@ next:
* period the timer is deactivated until scheduling resumes; cfs_b->idle is
* used to track this state.
*/
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
u64 runtime, runtime_expires;
int throttled;
@@ -4640,11 +4690,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
@@ -4705,6 +4755,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
if (runtime_refresh_within(cfs_b, min_left))
return;
+ /* don't push forwards an existing deferred unthrottle */
+ if (cfs_b->slack_started)
+ return;
+ cfs_b->slack_started = true;
+
hrtimer_start(&cfs_b->slack_timer,
ns_to_ktime(cfs_bandwidth_slack_period),
HRTIMER_MODE_REL);
@@ -4753,17 +4808,19 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ unsigned long flags;
u64 expires;
/* confirm we're still not at a refresh boundary */
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
+ cfs_b->slack_started = false;
if (cfs_b->distribute_running) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
}
@@ -4774,18 +4831,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (runtime)
cfs_b->distribute_running = 1;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
if (!runtime)
return;
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
if (expires == cfs_b->runtime_expires)
lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
/*
@@ -4859,24 +4916,50 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+extern const u64 max_cfs_quota_period;
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
+ unsigned long flags;
int overrun;
int idle = 0;
+ int count = 0;
- raw_spin_lock(&cfs_b->lock);
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
for (;;) {
overrun = hrtimer_forward_now(timer, cfs_b->period);
if (!overrun)
break;
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ new = (old * 147) / 128; /* ~115% */
+ new = min(new, max_cfs_quota_period);
+
+ cfs_b->period = ns_to_ktime(new);
+
+ /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
+ cfs_b->quota *= new;
+ cfs_b->quota = div64_u64(cfs_b->quota, old);
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
- raw_spin_unlock(&cfs_b->lock);
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -4894,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
cfs_b->distribute_running = 0;
+ cfs_b->slack_started = false;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -4986,6 +5070,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return false;
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
return rq_clock_task(rq_of(cfs_rq));
@@ -5083,7 +5173,6 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline unsigned long cpu_util(int cpu);
-static unsigned long capacity_of(int cpu);
static inline bool cpu_overutilized(int cpu)
{
@@ -5092,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu)
static inline void update_overutilized_status(struct rq *rq)
{
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+ if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+ trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
+ }
}
#else
static inline void update_overutilized_status(struct rq *rq) { }
@@ -5177,6 +5268,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
+ if (cfs_bandwidth_used()) {
+ /*
+ * When bandwidth control is enabled; the cfs_rq_throttled()
+ * breaks in the above iteration can result in incomplete
+ * leaf list maintenance, resulting in triggering the assertion
+ * below.
+ */
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (list_add_leaf_cfs_rq(cfs_rq))
+ break;
+ }
+ }
+
+ assert_list_leaf_cfs_rq(rq);
+
hrtick_update(rq);
}
@@ -5247,71 +5355,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
#ifdef CONFIG_NO_HZ_COMMON
-/*
- * per rq 'load' arrray crap; XXX kill this.
- */
-
-/*
- * The exact cpuload calculated at every tick would be:
- *
- * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
- *
- * If a CPU misses updates for n ticks (as it was idle) and update gets
- * called on the n+1-th tick when CPU may be busy, then we have:
- *
- * load_n = (1 - 1/2^i)^n * load_0
- * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- *
- * load' = (1 - 1/2^i)^n * load
- *
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
- * This allows us to precompute the above in said factors, thereby allowing the
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
- * fixed_power_int())
- *
- * The calculation is approximated on a 128 point scale.
- */
-#define DEGRADE_SHIFT 7
-
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- { 0, 0, 0, 0, 0, 0, 0, 0 },
- { 64, 32, 8, 0, 0, 0, 0, 0 },
- { 96, 72, 40, 12, 1, 0, 0, 0 },
- { 112, 98, 75, 43, 15, 1, 0, 0 },
- { 120, 112, 98, 76, 45, 16, 2, 0 }
-};
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
static struct {
cpumask_var_t idle_cpus_mask;
@@ -5323,249 +5366,21 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
-/**
- * __cpu_load_update - update the rq->cpu_load[] statistics
- * @this_rq: The rq to update statistics for
- * @this_load: The current load
- * @pending_updates: The number of missed updates
- *
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
- *
- * This function computes a decaying average:
- *
- * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
- *
- * Because of NOHZ it might not get called on every tick which gives need for
- * the @pending_updates argument.
- *
- * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
- * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
- * = A * (A * load[i]_n-2 + B) + B
- * = A * (A * (A * load[i]_n-3 + B) + B) + B
- * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
- * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
- * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
- * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
- *
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
- * any change in load would have resulted in the tick being turned back on.
- *
- * For regular NOHZ, this reduces to:
- *
- * load[i]_n = (1 - 1/2^i)^n * load[i]_0
- *
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term.
- */
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
-{
- unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
-#ifdef CONFIG_NO_HZ_COMMON
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- if (tickless_load) {
- old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
- /*
- * old_load can never be a negative value because a
- * decayed tickless_load cannot be greater than the
- * original tickless_load.
- */
- old_load += tickless_load;
- }
-#endif
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-}
-
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(struct rq *rq)
+static unsigned long cpu_runnable_load(struct rq *rq)
{
return cfs_rq_runnable_load_avg(&rq->cfs);
}
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we need to avoid the delta approach from the regular tick when
- * possible since that would seriously skew the load calculation. This is why we
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
- * loop exit, nohz_idle_balance, nohz full exit...)
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-static void cpu_load_update_nohz(struct rq *this_rq,
- unsigned long curr_jiffies,
- unsigned long load)
-{
- unsigned long pending_updates;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * In the regular NOHZ case, we were idle, this means load 0.
- * In the NOHZ_FULL case, we were non-idle, we should consider
- * its weighted load.
- */
- cpu_load_update(this_rq, load, pending_updates);
- }
-}
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-static void cpu_load_update_idle(struct rq *this_rq)
-{
- /*
- * bail if there's load or we're actually up-to-date.
- */
- if (weighted_cpuload(this_rq))
- return;
-
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
-}
-
-/*
- * Record CPU load on nohz entry so we know the tickless load to account
- * on nohz exit. cpu_load[0] happens then to be updated more frequently
- * than other cpu_load[idx] but it should be fine as cpu_load readers
- * shouldn't rely into synchronized cpu_load[*] updates.
- */
-void cpu_load_update_nohz_start(void)
-{
- struct rq *this_rq = this_rq();
-
- /*
- * This is all lockless but should be fine. If weighted_cpuload changes
- * concurrently we'll exit nohz. And cpu_load write can race with
- * cpu_load_update_idle() but both updater would be writing the same.
- */
- this_rq->cpu_load[0] = weighted_cpuload(this_rq);
-}
-
-/*
- * Account the tickless load in the end of a nohz frame.
- */
-void cpu_load_update_nohz_stop(void)
-{
- unsigned long curr_jiffies = READ_ONCE(jiffies);
- struct rq *this_rq = this_rq();
- unsigned long load;
- struct rq_flags rf;
-
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- load = weighted_cpuload(this_rq);
- rq_lock(this_rq, &rf);
- update_rq_clock(this_rq);
- cpu_load_update_nohz(this_rq, curr_jiffies, load);
- rq_unlock(this_rq, &rf);
-}
-#else /* !CONFIG_NO_HZ_COMMON */
-static inline void cpu_load_update_nohz(struct rq *this_rq,
- unsigned long curr_jiffies,
- unsigned long load) { }
-#endif /* CONFIG_NO_HZ_COMMON */
-
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
-{
-#ifdef CONFIG_NO_HZ_COMMON
- /* See the mess around cpu_load_update_nohz(). */
- this_rq->last_load_update_tick = READ_ONCE(jiffies);
-#endif
- cpu_load_update(this_rq, load, 1);
-}
-
-/*
- * Called from scheduler_tick()
- */
-void cpu_load_update_active(struct rq *this_rq)
-{
- unsigned long load = weighted_cpuload(this_rq);
-
- if (tick_nohz_tick_stopped())
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
- else
- cpu_load_update_periodic(this_rq, load);
-}
-
-/*
- * Return a low guess at the load of a migration-source CPU weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
- */
-static unsigned long source_load(int cpu, int type)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(rq);
-
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return min(rq->cpu_load[type-1], total);
-}
-
-/*
- * Return a high guess at the load of a migration-target CPU weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(rq);
-
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return max(rq->cpu_load[type-1], total);
-}
-
static unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
}
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(rq);
+ unsigned long load_avg = cpu_runnable_load(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5663,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- this_eff_load = target_load(this_cpu, sd->wake_idx);
+ this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
@@ -5681,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+ prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5742,14 +5557,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
unsigned long this_runnable_load = ULONG_MAX;
unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
- int load_idx = sd->forkexec_idx;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
- if (sd_flag & SD_BALANCE_WAKE)
- load_idx = sd->wake_idx;
-
do {
unsigned long load, avg_load, runnable_load;
unsigned long spare_cap, max_spare_cap;
@@ -5758,7 +5569,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_span(group),
- &p->cpus_allowed))
+ p->cpus_ptr))
continue;
local_group = cpumask_test_cpu(this_cpu,
@@ -5773,12 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
max_spare_cap = 0;
for_each_cpu(i, sched_group_span(group)) {
- /* Bias balancing toward CPUs of our domain */
- if (local_group)
- load = source_load(i, load_idx);
- else
- load = target_load(i, load_idx);
-
+ load = cpu_runnable_load(cpu_rq(i));
runnable_load += load;
avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
@@ -5890,7 +5696,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5914,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(cpu_rq(i));
+ load = cpu_runnable_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
@@ -5930,7 +5736,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
{
int new_cpu = cpu;
- if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
return prev_cpu;
/*
@@ -6047,13 +5853,13 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
if (!test_idle_cores(target, false))
return -1;
- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
- cpumask_clear_cpu(cpu, cpus);
+ __cpumask_clear_cpu(cpu, cpus);
if (!available_idle_cpu(cpu))
idle = false;
}
@@ -6073,7 +5879,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_smt(struct task_struct *p, int target)
{
int cpu;
@@ -6081,7 +5887,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
return cpu;
@@ -6097,7 +5903,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static inline int select_idle_smt(struct task_struct *p, int target)
{
return -1;
}
@@ -6116,6 +5922,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 time, cost;
s64 delta;
int cpu, nr = INT_MAX;
+ int this = smp_processor_id();
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6139,18 +5946,18 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
nr = 4;
}
- time = local_clock();
+ time = cpu_clock(this);
for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!--nr)
return -1;
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
break;
}
- time = local_clock() - time;
+ time = cpu_clock(this) - time;
cost = this_sd->avg_scan_cost;
delta = (s64)(time - cost) / 8;
this_sd->avg_scan_cost += delta;
@@ -6181,7 +5988,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
available_idle_cpu(recent_used_cpu) &&
- cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@@ -6202,7 +6009,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, sd, target);
+ i = select_idle_smt(p, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -6425,11 +6232,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
- long util, max_util, sum_util, energy = 0;
+ unsigned int max_util, util_cfs, cpu_util, cpu_cap;
+ unsigned long sum_util, energy = 0;
+ struct task_struct *tsk;
int cpu;
for (; pd; pd = pd->next) {
+ struct cpumask *pd_mask = perf_domain_span(pd);
+
+ /*
+ * The energy model mandates all the CPUs of a performance
+ * domain have the same capacity.
+ */
+ cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
max_util = sum_util = 0;
+
/*
* The capacity state of CPUs of the current rd can be driven by
* CPUs of another rd if they belong to the same performance
@@ -6440,11 +6257,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* it will not appear in its pd list and will not be accounted
* by compute_energy().
*/
- for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
- util = cpu_util_next(cpu, p, dst_cpu);
- util = schedutil_energy_util(cpu, util);
- max_util = max(util, max_util);
- sum_util += util;
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+ util_cfs = cpu_util_next(cpu, p, dst_cpu);
+
+ /*
+ * Busy time computation: utilization clamping is not
+ * required since the ratio (sum_util / cpu_capacity)
+ * is already enough to scale the EM reported power
+ * consumption at the (eventually clamped) cpu_capacity.
+ */
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ ENERGY_UTIL, NULL);
+
+ /*
+ * Performance domain frequency: utilization clamping
+ * must be considered since it affects the selection
+ * of the performance domain frequency.
+ * NOTE: in case RT tasks are running, by default the
+ * FREQUENCY_UTIL's utilization can be max OPP.
+ */
+ tsk = cpu == dst_cpu ? p : NULL;
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, cpu_util);
}
energy += em_pd_energy(pd->em_pd, max_util, sum_util);
@@ -6527,7 +6362,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
int max_spare_cap_cpu = -1;
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
/* Skip CPUs that will be overutilized. */
@@ -6608,7 +6443,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- if (static_branch_unlikely(&sched_energy_present)) {
+ if (sched_energy_enabled()) {
new_cpu = find_energy_efficient_cpu(p, prev_cpu);
if (new_cpu >= 0)
return new_cpu;
@@ -6616,7 +6451,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed);
+ cpumask_test_cpu(cpu, p->cpus_ptr);
}
rcu_read_lock();
@@ -7027,6 +6862,12 @@ idle:
if (new_tasks > 0)
goto again;
+ /*
+ * rq is about to be idle, check if we need to update the
+ * lost_idle_time of clock_pelt
+ */
+ update_idle_rq_clock_pelt(rq);
+
return NULL;
}
@@ -7366,14 +7207,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
*/
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7393,7 +7234,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Prevent to re-select dst_cpu via env's CPUs: */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
@@ -7441,7 +7282,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
- p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -7480,7 +7320,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;
/*
- * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * detach_tasks() -- tries to detach up to imbalance runnable load from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
@@ -7548,7 +7388,7 @@ static int detach_tasks(struct lb_env *env)
/*
* We only want to steal up to the prescribed amount of
- * weighted load.
+ * runnable load.
*/
if (env->imbalance <= 0)
break;
@@ -7577,7 +7417,6 @@ static void attach_task(struct rq *rq, struct task_struct *p)
BUG_ON(task_rq(p) != rq);
activate_task(rq, p, ENQUEUE_NOCLOCK);
- p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -7618,6 +7457,7 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
+#ifdef CONFIG_NO_HZ_COMMON
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
@@ -7645,12 +7485,42 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
+ rq->last_blocked_load_update_tick = jiffies;
+
+ if (!has_blocked)
+ rq->has_blocked_load = 0;
+}
+#else
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load.weight)
+ return false;
+
+ if (cfs_rq->avg.load_sum)
+ return false;
+
+ if (cfs_rq->avg.util_sum)
+ return false;
+
+ if (cfs_rq->avg.runnable_load_sum)
+ return false;
+
+ return true;
+}
+
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
@@ -7662,14 +7532,10 @@ static void update_blocked_averages(int cpu)
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq(rq, cfs_rq) {
+ for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- continue;
-
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -7677,24 +7543,27 @@ static void update_blocked_averages(int cpu)
if (se && !skip_blocked_update(se))
update_load_avg(cfs_rq_of(se), se, 0);
+ /*
+ * There can be a lot of idle CPU cgroups. Don't let fully
+ * decayed cfs_rqs linger on the list.
+ */
+ if (cfs_rq_is_decayed(cfs_rq))
+ list_del_leaf_cfs_rq(cfs_rq);
+
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
done = false;
}
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
done = false;
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (done)
- rq->has_blocked_load = 0;
-#endif
+ update_blocked_load_status(rq, !done);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7713,10 +7582,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
if (cfs_rq->last_h_load_update == now)
return;
- cfs_rq->h_load_next = NULL;
+ WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_load_next = se;
+ WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
@@ -7726,7 +7595,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
cfs_rq->last_h_load_update = now;
}
- while ((se = cfs_rq->h_load_next) != NULL) {
+ while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
@@ -7754,17 +7623,13 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
- rq->has_blocked_load = 0;
-#endif
+ update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
rq_unlock_irqrestore(rq, &rf);
}
@@ -7782,7 +7647,6 @@ static unsigned long task_h_load(struct task_struct *p)
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
@@ -7836,38 +7700,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
-/**
- * get_sd_load_idx - Obtain the load index for a given sched domain.
- * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
- *
- * Return: The load index.
- */
-static inline int get_sd_load_idx(struct sched_domain *sd,
- enum cpu_idle_type idle)
-{
- int load_idx;
-
- switch (idle) {
- case CPU_NOT_IDLE:
- load_idx = sd->busy_idx;
- break;
-
- case CPU_NEWLY_IDLE:
- load_idx = sd->newidle_idx;
- break;
- default:
- load_idx = sd->idle_idx;
- break;
- }
-
- return load_idx;
-}
-
static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(sd, cpu);
+ unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -7892,7 +7728,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
unsigned long capacity = scale_rt_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
if (!capacity)
capacity = 1;
@@ -7989,8 +7825,20 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
}
/*
+ * Check whether a rq has a misfit task and if it looks like we can actually
+ * help that task: we can migrate the task to a CPU of higher capacity, or
+ * the task's current CPU is heavily pressured.
+ */
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+{
+ return rq->misfit_task_load &&
+ (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+ check_cpu_capacity(rq, sd));
+}
+
+/*
* Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to ->cpus_allowed constraints.
+ * groups is inadequate due to ->cpus_ptr constraints.
*
* Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
* cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8140,9 +7988,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sg_lb_stats *sgs,
int *sg_status)
{
- int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
- int load_idx = get_sd_load_idx(env->sd, env->idle);
- unsigned long load;
int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
@@ -8153,13 +7998,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
- /* Bias balancing toward CPUs of our domain: */
- if (local_group)
- load = target_load(i, load_idx);
- else
- load = source_load(i, load_idx);
-
- sgs->group_load += load;
+ sgs->group_load += cpu_runnable_load(rq);
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
@@ -8174,7 +8013,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(rq);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -8193,7 +8031,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+ sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
@@ -8407,8 +8245,12 @@ next_group:
/* Update over-utilization (tipping point, U >= 0) indicator */
WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
+ trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
} else if (sg_status & SG_OVERUTILIZED) {
- WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
+ struct root_domain *rd = env->dst_rq->rd;
+
+ WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
+ trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
}
}
@@ -8452,9 +8294,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
return 0;
- env->imbalance = DIV_ROUND_CLOSEST(
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
- SCHED_CAPACITY_SCALE);
+ env->imbalance = sds->busiest_stat.group_load;
return 1;
}
@@ -8616,7 +8456,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
*
- * Also calculates the amount of weighted load which should be moved
+ * Also calculates the amount of runnable load which should be moved
* to restore balance.
*
* @env: The load balancing environment.
@@ -8636,7 +8476,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);
- if (static_branch_unlikely(&sched_energy_present)) {
+ if (sched_energy_enabled()) {
struct root_domain *rd = env->dst_rq->rd;
if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
@@ -8661,7 +8501,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
- * isn't true due to cpus_allowed constraints and the like.
+ * isn't true due to cpus_ptr constraints and the like.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
@@ -8735,7 +8575,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
- unsigned long capacity, wl;
+ unsigned long capacity, load;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -8789,30 +8629,30 @@ static struct rq *find_busiest_queue(struct lb_env *env,
rq->nr_running == 1)
continue;
- wl = weighted_cpuload(rq);
+ load = cpu_runnable_load(rq);
/*
- * When comparing with imbalance, use weighted_cpuload()
+ * When comparing with imbalance, use cpu_runnable_load()
* which is not scaled with the CPU capacity.
*/
- if (rq->nr_running == 1 && wl > env->imbalance &&
+ if (rq->nr_running == 1 && load > env->imbalance &&
!check_cpu_capacity(rq, env->sd))
continue;
/*
* For the load comparisons with the other CPU's, consider
- * the weighted_cpuload() scaled with the CPU capacity, so
+ * the cpu_runnable_load() scaled with the CPU capacity, so
* that the load can be moved away from the CPU that is
* potentially running at a lower capacity.
*
- * Thus we're looking for max(wl_i / capacity_i), crosswise
+ * Thus we're looking for max(load_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * to: load_i * capacity_j > load_j * capacity_i; where j is
* our previous maximum.
*/
- if (wl * busiest_capacity > busiest_load * capacity) {
- busiest_load = wl;
+ if (load * busiest_capacity > busiest_load * capacity) {
+ busiest_load = load;
busiest_capacity = capacity;
busiest = rq;
}
@@ -8827,21 +8667,25 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
#define MAX_PINNED_INTERVAL 512
-static int need_active_balance(struct lb_env *env)
+static inline bool
+asym_active_balance(struct lb_env *env)
{
- struct sched_domain *sd = env->sd;
+ /*
+ * ASYM_PACKING needs to force migrate tasks from busy but
+ * lower priority CPUs in order to pack all tasks in the
+ * highest priority CPUs.
+ */
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
+}
- if (env->idle == CPU_NEWLY_IDLE) {
+static inline bool
+voluntary_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
- /*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
- */
- if ((sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
- return 1;
- }
+ if (asym_active_balance(env))
+ return 1;
/*
* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
@@ -8859,6 +8703,16 @@ static int need_active_balance(struct lb_env *env)
if (env->src_grp_type == group_misfit_task)
return 1;
+ return 0;
+}
+
+static int need_active_balance(struct lb_env *env)
+{
+ struct sched_domain *sd = env->sd;
+
+ if (voluntary_active_balance(env))
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -9023,7 +8877,7 @@ more_balance:
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/* Prevent to re-select dst_cpu via env's CPUs */
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
@@ -9050,7 +8904,7 @@ more_balance:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
- cpumask_clear_cpu(cpu_of(busiest), cpus);
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
/*
* Attempting to continue load balancing at the current
* sched_domain level only makes sense if there are
@@ -9089,7 +8943,7 @@ more_balance:
* if the curr task on busiest CPU can't be
* moved to this_cpu:
*/
- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
@@ -9120,7 +8974,7 @@ more_balance:
} else
sd->nr_balance_failed = 0;
- if (likely(!active_balance)) {
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
@@ -9427,22 +9281,26 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
+ * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
+ * anywhere yet.
*/
static inline int find_new_ilb(void)
{
- int ilb = cpumask_first(nohz.idle_cpus_mask);
+ int ilb;
- if (ilb < nr_cpu_ids && idle_cpu(ilb))
- return ilb;
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+ housekeeping_cpumask(HK_FLAG_MISC)) {
+ if (idle_cpu(ilb))
+ return ilb;
+ }
return nr_cpu_ids;
}
/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
- * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
- * CPU (if there is one).
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
+ * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -9469,15 +9327,8 @@ static void kick_ilb(unsigned int flags)
}
/*
- * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
- * - This rq has more than one task.
- * - This rq has at least one CFS task and the capacity of the CPU is
- * significantly reduced because of RT tasks or IRQs.
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
- * multiple busy cpu.
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
- * domain span are idle.
+ * Current decision point for kicking the idle load balancer in the presence
+ * of idle CPUs in the system.
*/
static void nohz_balancer_kick(struct rq *rq)
{
@@ -9510,30 +9361,21 @@ static void nohz_balancer_kick(struct rq *rq)
if (time_before(now, nohz.next_balance))
goto out;
- if (rq->nr_running >= 2 || rq->misfit_task_load) {
+ if (rq->nr_running >= 2) {
flags = NOHZ_KICK_MASK;
goto out;
}
rcu_read_lock();
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds) {
- /*
- * XXX: write a coherent comment on why we do this.
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
- */
- nr_busy = atomic_read(&sds->nr_busy_cpus);
- if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
- goto unlock;
- }
-
- }
sd = rcu_dereference(rq->sd);
if (sd) {
- if ((rq->cfs.h_nr_running >= 1) &&
- check_cpu_capacity(rq, sd)) {
+ /*
+ * If there's a CFS task and the current CPU has reduced
+ * capacity; kick the ILB to see if there's a better CPU to run
+ * on.
+ */
+ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
@@ -9541,17 +9383,57 @@ static void nohz_balancer_kick(struct rq *rq)
sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
if (sd) {
- for_each_cpu(i, sched_domain_span(sd)) {
- if (i == cpu ||
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
- continue;
-
+ /*
+ * When ASYM_PACKING; see if there's a more preferred CPU
+ * currently idle; in which case, kick the ILB to move tasks
+ * around.
+ */
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
if (sched_asym_prefer(i, cpu)) {
flags = NOHZ_KICK_MASK;
goto unlock;
}
}
}
+
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
+ if (sd) {
+ /*
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
+ * to run the misfit task on.
+ */
+ if (check_misfit_status(rq, sd)) {
+ flags = NOHZ_KICK_MASK;
+ goto unlock;
+ }
+
+ /*
+ * For asymmetric systems, we do not want to nicely balance
+ * cache use, instead we want to embrace asymmetry and only
+ * ensure tasks have enough CPU capacity.
+ *
+ * Skip the LLC logic because it's not relevant in that case.
+ */
+ goto unlock;
+ }
+
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+ if (sds) {
+ /*
+ * If there is an imbalance between LLC domains (IOW we could
+ * increase the overall cache use), we need some less-loaded LLC
+ * domain to pull some load. Likewise, we may need to spread
+ * load within the current LLC domain (e.g. packed SMT cores but
+ * other CPUs are idle). We can't really know from here how busy
+ * the others are - so just get a nohz balance going if it looks
+ * like this LLC domain has tasks we could move.
+ */
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
+ if (nr_busy > 1) {
+ flags = NOHZ_KICK_MASK;
+ goto unlock;
+ }
+ }
unlock:
rcu_read_unlock();
out:
@@ -9730,7 +9612,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- cpu_load_update_idle(rq);
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
@@ -10541,15 +10422,19 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_change_group = task_change_group_fair,
#endif
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq, *pos;
rcu_read_lock();
- for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+ for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}
@@ -10588,3 +10473,83 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
+
+/*
+ * Helper functions to facilitate extracting info from tracepoints.
+ */
+
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_SMP
+ return cfs_rq ? &cfs_rq->avg : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
+
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
+{
+ if (!cfs_rq) {
+ if (str)
+ strlcpy(str, "(null)", len);
+ else
+ return NULL;
+ }
+
+ cfs_rq_tg_path(cfs_rq, str, len);
+ return str;
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
+
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
+
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq ? &rq->avg_rt : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
+
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq ? &rq->avg_dl : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
+
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
+ return rq ? &rq->avg_irq : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
+
+int sched_trace_rq_cpu(struct rq *rq)
+{
+ return rq ? cpu_of(rq) : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
+
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
+{
+#ifdef CONFIG_SMP
+ return rd ? rd->span : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 858589b83377..2410db5e9a35 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
-SCHED_FEAT(LB_BIAS, false)
/*
* Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5516bae0c1b..80940939b733 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic entry points for the idle threads and
* implementation of the idle task scheduling class.
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 81faddba9e20..123ea07a3f3b 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Housekeeping management. Manage the targets for routine code that can run on
* any CPU: unbound workqueues, timers, kthreads and any offloadable work.
@@ -65,6 +66,7 @@ void __init housekeeping_init(void)
static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
cpumask_var_t non_housekeeping_mask;
+ cpumask_var_t tmp;
int err;
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
@@ -75,16 +77,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
return 0;
}
+ alloc_bootmem_cpumask_var(&tmp);
if (!housekeeping_flags) {
alloc_bootmem_cpumask_var(&housekeeping_mask);
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, non_housekeeping_mask);
- if (cpumask_empty(housekeeping_mask))
- cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
- } else {
- cpumask_var_t tmp;
- alloc_bootmem_cpumask_var(&tmp);
+ cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+ if (cpumask_empty(tmp)) {
+ pr_warn("Housekeeping: must include one present CPU, "
+ "using boot CPU:%d\n", smp_processor_id());
+ __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+ __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+ }
+ } else {
+ cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+ if (cpumask_empty(tmp))
+ __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
if (!cpumask_equal(tmp, housekeeping_mask)) {
pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
@@ -92,8 +101,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
- free_bootmem_cpumask_var(tmp);
}
+ free_bootmem_cpumask_var(tmp);
if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 3cd8a3a795d2..aa8d75804108 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -1,17 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
*
* membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include "sched.h"
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 90fb5bc12ad4..a96db50d40e0 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -26,9 +26,10 @@
#include <linux/sched.h>
#include "sched.h"
-#include "sched-pelt.h"
#include "pelt.h"
+#include <trace/events/sched.h>
+
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -106,16 +107,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
* n=1
*/
static __always_inline u32
-accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+accumulate_sum(u64 delta, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
- unsigned long scale_freq, scale_cpu;
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
- scale_freq = arch_scale_freq_capacity(cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
delta += sa->period_contrib;
periods = delta / 1024; /* A period is 1024us (~1ms) */
@@ -137,13 +134,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
}
sa->period_contrib = delta;
- contrib = cap_scale(contrib, scale_freq);
if (load)
sa->load_sum += load * contrib;
if (runnable)
sa->runnable_load_sum += runnable * contrib;
if (running)
- sa->util_sum += contrib * scale_cpu;
+ sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
return periods;
}
@@ -177,7 +173,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+___update_load_sum(u64 now, struct sched_avg *sa,
unsigned long load, unsigned long runnable, int running)
{
u64 delta;
@@ -221,7 +217,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
+ if (!accumulate_sum(delta, sa, load, runnable, running))
return 0;
return 1;
@@ -267,37 +263,40 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
* runnable_load_avg = \Sum se->avg.runable_load_avg
*/
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+ if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ trace_pelt_se_tp(se);
return 1;
}
return 0;
}
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+ if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq,
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
cfs_se_util_change(&se->avg);
+ trace_pelt_se_tp(se);
return 1;
}
return 0;
}
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
- if (___update_load_sum(now, cpu, &cfs_rq->avg,
+ if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
scale_load_down(cfs_rq->runnable_weight),
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1, 1);
+ trace_pelt_cfs_tp(cfs_rq);
return 1;
}
@@ -317,12 +316,13 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
- if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
+ if (___update_load_sum(now, &rq->avg_rt,
running,
running,
running)) {
___update_load_avg(&rq->avg_rt, 1, 1);
+ trace_pelt_rt_tp(rq);
return 1;
}
@@ -340,12 +340,13 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
- if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
+ if (___update_load_sum(now, &rq->avg_dl,
running,
running,
running)) {
___update_load_avg(&rq->avg_dl, 1, 1);
+ trace_pelt_dl_tp(rq);
return 1;
}
@@ -365,28 +366,39 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
int update_irq_load_avg(struct rq *rq, u64 running)
{
int ret = 0;
+
+ /*
+ * We can't use clock_pelt because irq time is not accounted in
+ * clock_task. Instead we directly scale the running time to
+ * reflect the real amount of computation
+ */
+ running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
+ running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
+
/*
* We know the time that has been used by interrupt since last update
* but we don't when. Let be pessimistic and assume that interrupt has
* happened just before the update. This is not so far from reality
* because interrupt will most probably wake up task and trig an update
- * of rq clock during which the metric si updated.
+ * of rq clock during which the metric is updated.
* We start to decay with normal context time and then we add the
* interrupt context time.
* We can safely remove running from rq->clock because
* rq->clock += delta with delta >= running
*/
- ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
+ ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
0,
0,
0);
- ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
+ ret += ___update_load_sum(rq->clock, &rq->avg_irq,
1,
1,
1);
- if (ret)
+ if (ret) {
___update_load_avg(&rq->avg_irq, 1, 1);
+ trace_pelt_irq_tp(rq);
+ }
return ret;
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7e56b489ff32..afff644da065 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,8 +1,9 @@
#ifdef CONFIG_SMP
+#include "sched-pelt.h"
-int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
-int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
-int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
+int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
+int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
+int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
@@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
+/*
+ * The clock_pelt scales the time to reflect the effective amount of
+ * computation done during the running delta time but then sync back to
+ * clock_task when rq is idle.
+ *
+ *
+ * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
+ * @ max capacity ------******---------------******---------------
+ * @ half capacity ------************---------************---------
+ * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
+ *
+ */
+static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
+{
+ if (unlikely(is_idle_task(rq->curr))) {
+ /* The rq is idle, we can sync to clock_task */
+ rq->clock_pelt = rq_clock_task(rq);
+ return;
+ }
+
+ /*
+ * When a rq runs at a lower compute capacity, it will need
+ * more time to do the same amount of work than at max
+ * capacity. In order to be invariant, we scale the delta to
+ * reflect how much work has been really done.
+ * Running longer results in stealing idle time that will
+ * disturb the load signal compared to max capacity. This
+ * stolen idle time will be automatically reflected when the
+ * rq will be idle and the clock will be synced with
+ * rq_clock_task.
+ */
+
+ /*
+ * Scale the elapsed time to reflect the real amount of
+ * computation
+ */
+ delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
+ delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
+
+ rq->clock_pelt += delta;
+}
+
+/*
+ * When rq becomes idle, we have to check if it has lost idle time
+ * because it was fully busy. A rq is fully used when the /Sum util_sum
+ * is greater or equal to:
+ * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
+ * For optimization and computing rounding purpose, we don't take into account
+ * the position in the current window (period_contrib) and we use the higher
+ * bound of util_sum to decide.
+ */
+static inline void update_idle_rq_clock_pelt(struct rq *rq)
+{
+ u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
+ u32 util_sum = rq->cfs.avg.util_sum;
+ util_sum += rq->avg_rt.util_sum;
+ util_sum += rq->avg_dl.util_sum;
+
+ /*
+ * Reflecting stolen time makes sense only if the idle
+ * phase would be present at max capacity. As soon as the
+ * utilization of a rq has reached the maximum value, it is
+ * considered as an always runnig rq without idle time to
+ * steal. This potential idle time is considered as lost in
+ * this case. We keep track of this lost idle time compare to
+ * rq's clock_task.
+ */
+ if (util_sum >= divider)
+ rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_held(&rq->lock);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ if (unlikely(cfs_rq->throttle_count))
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+}
+#else
+static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
+{
+ return rq_clock_pelt(rq_of(cfs_rq));
+}
+#endif
+
#else
static inline int
@@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
+
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ return rq_clock_task(rq);
+}
+
+static inline void
+update_rq_clock_pelt(struct rq *rq, s64 delta) { }
+
+static inline void
+update_idle_rq_clock_pelt(struct rq *rq) { }
+
#endif
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index c3484785b179..7acc632c3b82 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -4,6 +4,9 @@
* Copyright (c) 2018 Facebook, Inc.
* Author: Johannes Weiner <hannes@cmpxchg.org>
*
+ * Polling support by Suren Baghdasaryan <surenb@google.com>
+ * Copyright (c) 2018 Google, Inc.
+ *
* When CPU, memory and IO are contended, tasks experience delays that
* reduce throughput and introduce latencies into the workload. Memory
* and IO contention, in addition, can cause a full loss of forward
@@ -129,9 +132,13 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/seqlock.h>
+#include <linux/uaccess.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/poll.h>
#include <linux/psi.h>
#include "sched.h"
@@ -140,9 +147,9 @@ static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
-bool psi_enable;
+static bool psi_enable;
#else
-bool psi_enable = true;
+static bool psi_enable = true;
#endif
static int __init setup_psi(char *str)
{
@@ -156,16 +163,21 @@ __setup("psi=", setup_psi);
#define EXP_60s 1981 /* 1/exp(2s/60s) */
#define EXP_300s 2034 /* 1/exp(2s/300s) */
+/* PSI trigger definitions */
+#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
+#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
+#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
+
/* Sampling frequency in nanoseconds */
static u64 psi_period __read_mostly;
/* System-level pressure and stall tracking */
static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
-static struct psi_group psi_system = {
+struct psi_group psi_system = {
.pcpu = &system_group_pcpu,
};
-static void psi_update_work(struct work_struct *work);
+static void psi_avgs_work(struct work_struct *work);
static void group_init(struct psi_group *group)
{
@@ -173,9 +185,20 @@ static void group_init(struct psi_group *group)
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
- group->next_update = sched_clock() + psi_period;
- INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
- mutex_init(&group->stat_lock);
+ group->avg_next_update = sched_clock() + psi_period;
+ INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+ mutex_init(&group->avgs_lock);
+ /* Init trigger-related members */
+ atomic_set(&group->poll_scheduled, 0);
+ mutex_init(&group->trigger_lock);
+ INIT_LIST_HEAD(&group->triggers);
+ memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
+ group->poll_states = 0;
+ group->poll_min_period = U32_MAX;
+ memset(group->polling_total, 0, sizeof(group->polling_total));
+ group->polling_next_update = ULLONG_MAX;
+ group->polling_until = 0;
+ rcu_assign_pointer(group->poll_kworker, NULL);
}
void __init psi_init(void)
@@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
}
}
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
+static void get_recent_times(struct psi_group *group, int cpu,
+ enum psi_aggregators aggregator, u32 *times,
+ u32 *pchanged_states)
{
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
- unsigned int tasks[NR_PSI_TASK_COUNTS];
u64 now, state_start;
+ enum psi_states s;
unsigned int seq;
- int s;
+ u32 state_mask;
+
+ *pchanged_states = 0;
/* Snapshot a coherent view of the CPU state */
do {
seq = read_seqcount_begin(&groupc->seq);
now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
- memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+ state_mask = groupc->state_mask;
state_start = groupc->state_start;
} while (read_seqcount_retry(&groupc->seq, seq));
@@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
* (u32) and our reported pressure close to what's
* actually happening.
*/
- if (test_state(tasks, s))
+ if (state_mask & (1 << s))
times[s] += now - state_start;
- delta = times[s] - groupc->times_prev[s];
- groupc->times_prev[s] = times[s];
+ delta = times[s] - groupc->times_prev[aggregator][s];
+ groupc->times_prev[aggregator][s] = times[s];
times[s] = delta;
+ if (delta)
+ *pchanged_states |= (1 << s);
}
}
@@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
avg[2] = calc_load(avg[2], EXP_300s, pct);
}
-static bool update_stats(struct psi_group *group)
+static void collect_percpu_times(struct psi_group *group,
+ enum psi_aggregators aggregator,
+ u32 *pchanged_states)
{
u64 deltas[NR_PSI_STATES - 1] = { 0, };
- unsigned long missed_periods = 0;
unsigned long nonidle_total = 0;
- u64 now, expires, period;
+ u32 changed_states = 0;
int cpu;
int s;
- mutex_lock(&group->stat_lock);
-
/*
* Collect the per-cpu time buckets and average them into a
* single time sample that is normalized to wallclock time.
@@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group)
for_each_possible_cpu(cpu) {
u32 times[NR_PSI_STATES];
u32 nonidle;
+ u32 cpu_changed_states;
- get_recent_times(group, cpu, times);
+ get_recent_times(group, cpu, aggregator, times,
+ &cpu_changed_states);
+ changed_states |= cpu_changed_states;
nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
nonidle_total += nonidle;
@@ -315,14 +346,23 @@ static bool update_stats(struct psi_group *group)
/* total= */
for (s = 0; s < NR_PSI_STATES - 1; s++)
- group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
+ group->total[aggregator][s] +=
+ div_u64(deltas[s], max(nonidle_total, 1UL));
+
+ if (pchanged_states)
+ *pchanged_states = changed_states;
+}
+
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+ unsigned long missed_periods = 0;
+ u64 expires, period;
+ u64 avg_next_update;
+ int s;
/* avgX= */
- now = sched_clock();
- expires = group->next_update;
- if (now < expires)
- goto out;
- if (now - expires > psi_period)
+ expires = group->avg_next_update;
+ if (now - expires >= psi_period)
missed_periods = div_u64(now - expires, psi_period);
/*
@@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group)
* But the deltas we sample out of the per-cpu buckets above
* are based on the actual time elapsing between clock ticks.
*/
- group->next_update = expires + ((1 + missed_periods) * psi_period);
- period = now - (group->last_update + (missed_periods * psi_period));
- group->last_update = now;
+ avg_next_update = expires + ((1 + missed_periods) * psi_period);
+ period = now - (group->avg_last_update + (missed_periods * psi_period));
+ group->avg_last_update = now;
for (s = 0; s < NR_PSI_STATES - 1; s++) {
u32 sample;
- sample = group->total[s] - group->total_prev[s];
+ sample = group->total[PSI_AVGS][s] - group->avg_total[s];
/*
* Due to the lockless sampling of the time buckets,
* recorded time deltas can slip into the next period,
@@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group)
*/
if (sample > period)
sample = period;
- group->total_prev[s] += sample;
+ group->avg_total[s] += sample;
calc_avgs(group->avg[s], missed_periods, sample, period);
}
-out:
- mutex_unlock(&group->stat_lock);
- return nonidle_total;
+
+ return avg_next_update;
}
-static void psi_update_work(struct work_struct *work)
+static void psi_avgs_work(struct work_struct *work)
{
struct delayed_work *dwork;
struct psi_group *group;
+ u32 changed_states;
bool nonidle;
+ u64 now;
dwork = to_delayed_work(work);
- group = container_of(dwork, struct psi_group, clock_work);
+ group = container_of(dwork, struct psi_group, avgs_work);
+
+ mutex_lock(&group->avgs_lock);
+ now = sched_clock();
+
+ collect_percpu_times(group, PSI_AVGS, &changed_states);
+ nonidle = changed_states & (1 << PSI_NONIDLE);
/*
* If there is task activity, periodically fold the per-cpu
* times and feed samples into the running averages. If things
@@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work)
* Once restarted, we'll catch up the running averages in one
* go - see calc_avgs() and missed_periods.
*/
-
- nonidle = update_stats(group);
+ if (now >= group->avg_next_update)
+ group->avg_next_update = update_averages(group, now);
if (nonidle) {
- unsigned long delay = 0;
- u64 now;
+ schedule_delayed_work(dwork, nsecs_to_jiffies(
+ group->avg_next_update - now) + 1);
+ }
+
+ mutex_unlock(&group->avgs_lock);
+}
+
+/* Trigger tracking window manupulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+ u64 prev_growth)
+{
+ win->start_time = now;
+ win->start_value = value;
+ win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because PSI signal changes only in
+ * positive direction and over relatively small window sizes the growth
+ * is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+ u64 elapsed;
+ u64 growth;
+
+ elapsed = now - win->start_time;
+ growth = value - win->start_value;
+ /*
+ * After each tracking window passes win->start_value and
+ * win->start_time get reset and win->prev_growth stores
+ * the average per-window growth of the previous window.
+ * win->prev_growth is then used to interpolate additional
+ * growth from the previous window assuming it was linear.
+ */
+ if (elapsed > win->size)
+ window_reset(win, now, value, growth);
+ else {
+ u32 remaining;
+
+ remaining = win->size - elapsed;
+ growth += div_u64(win->prev_growth * remaining, win->size);
+ }
+
+ return growth;
+}
+
+static void init_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+
+ list_for_each_entry(t, &group->triggers, node)
+ window_reset(&t->win, now,
+ group->total[PSI_POLL][t->state], 0);
+ memcpy(group->polling_total, group->total[PSI_POLL],
+ sizeof(group->polling_total));
+ group->polling_next_update = now + group->poll_min_period;
+}
+
+static u64 update_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+ bool new_stall = false;
+ u64 *total = group->total[PSI_POLL];
+
+ /*
+ * On subsequent updates, calculate growth deltas and let
+ * watchers know when their specified thresholds are exceeded.
+ */
+ list_for_each_entry(t, &group->triggers, node) {
+ u64 growth;
+
+ /* Check for stall activity */
+ if (group->polling_total[t->state] == total[t->state])
+ continue;
+
+ /*
+ * Multiple triggers might be looking at the same state,
+ * remember to update group->polling_total[] once we've
+ * been through all of them. Also remember to extend the
+ * polling time if we see new stall activity.
+ */
+ new_stall = true;
+
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, total[t->state]);
+ if (growth < t->threshold)
+ continue;
+
+ /* Limit event signaling to once per window */
+ if (now < t->last_event_time + t->win.size)
+ continue;
+
+ /* Generate an event */
+ if (cmpxchg(&t->event, 0, 1) == 0)
+ wake_up_interruptible(&t->event_wait);
+ t->last_event_time = now;
+ }
+
+ if (new_stall)
+ memcpy(group->polling_total, total,
+ sizeof(group->polling_total));
+
+ return now + group->poll_min_period;
+}
+
+/*
+ * Schedule polling if it's not already scheduled. It's safe to call even from
+ * hotpath because even though kthread_queue_delayed_work takes worker->lock
+ * spinlock that spinlock is never contended due to poll_scheduled atomic
+ * preventing such competition.
+ */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+{
+ struct kthread_worker *kworker;
+
+ /* Do not reschedule if already scheduled */
+ if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+ return;
+
+ rcu_read_lock();
- now = sched_clock();
- if (group->next_update > now)
- delay = nsecs_to_jiffies(group->next_update - now) + 1;
- schedule_delayed_work(dwork, delay);
+ kworker = rcu_dereference(group->poll_kworker);
+ /*
+ * kworker might be NULL in case psi_trigger_destroy races with
+ * psi_task_change (hotpath) which can't use locks
+ */
+ if (likely(kworker))
+ kthread_queue_delayed_work(kworker, &group->poll_work, delay);
+ else
+ atomic_set(&group->poll_scheduled, 0);
+
+ rcu_read_unlock();
+}
+
+static void psi_poll_work(struct kthread_work *work)
+{
+ struct kthread_delayed_work *dwork;
+ struct psi_group *group;
+ u32 changed_states;
+ u64 now;
+
+ dwork = container_of(work, struct kthread_delayed_work, work);
+ group = container_of(dwork, struct psi_group, poll_work);
+
+ atomic_set(&group->poll_scheduled, 0);
+
+ mutex_lock(&group->trigger_lock);
+
+ now = sched_clock();
+
+ collect_percpu_times(group, PSI_POLL, &changed_states);
+
+ if (changed_states & group->poll_states) {
+ /* Initialize trigger windows when entering polling mode */
+ if (now > group->polling_until)
+ init_triggers(group, now);
+
+ /*
+ * Keep the monitor active for at least the duration of the
+ * minimum tracking window as long as monitor states are
+ * changing.
+ */
+ group->polling_until = now +
+ group->poll_min_period * UPDATES_PER_WINDOW;
+ }
+
+ if (now > group->polling_until) {
+ group->polling_next_update = ULLONG_MAX;
+ goto out;
}
+
+ if (now >= group->polling_next_update)
+ group->polling_next_update = update_triggers(group, now);
+
+ psi_schedule_poll_work(group,
+ nsecs_to_jiffies(group->polling_next_update - now) + 1);
+
+out:
+ mutex_unlock(&group->trigger_lock);
}
static void record_times(struct psi_group_cpu *groupc, int cpu,
@@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
delta = now - groupc->state_start;
groupc->state_start = now;
- if (test_state(groupc->tasks, PSI_IO_SOME)) {
+ if (groupc->state_mask & (1 << PSI_IO_SOME)) {
groupc->times[PSI_IO_SOME] += delta;
- if (test_state(groupc->tasks, PSI_IO_FULL))
+ if (groupc->state_mask & (1 << PSI_IO_FULL))
groupc->times[PSI_IO_FULL] += delta;
}
- if (test_state(groupc->tasks, PSI_MEM_SOME)) {
+ if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
groupc->times[PSI_MEM_SOME] += delta;
- if (test_state(groupc->tasks, PSI_MEM_FULL))
+ if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
else if (memstall_tick) {
u32 sample;
@@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
}
}
- if (test_state(groupc->tasks, PSI_CPU_SOME))
+ if (groupc->state_mask & (1 << PSI_CPU_SOME))
groupc->times[PSI_CPU_SOME] += delta;
- if (test_state(groupc->tasks, PSI_NONIDLE))
+ if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
}
-static void psi_group_change(struct psi_group *group, int cpu,
- unsigned int clear, unsigned int set)
+static u32 psi_group_change(struct psi_group *group, int cpu,
+ unsigned int clear, unsigned int set)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
+ enum psi_states s;
+ u32 state_mask = 0;
groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
+ /* Calculate state mask representing active states */
+ for (s = 0; s < NR_PSI_STATES; s++) {
+ if (test_state(groupc->tasks, s))
+ state_mask |= (1 << s);
+ }
+ groupc->state_mask = state_mask;
+
write_seqcount_end(&groupc->seq);
+
+ return state_mask;
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set)
*/
if (unlikely((clear & TSK_RUNNING) &&
(task->flags & PF_WQ_WORKER) &&
- wq_worker_last_func(task) == psi_update_work))
+ wq_worker_last_func(task) == psi_avgs_work))
wake_clock = false;
while ((group = iterate_groups(task, &iter))) {
- psi_group_change(group, cpu, clear, set);
- if (wake_clock && !delayed_work_pending(&group->clock_work))
- schedule_delayed_work(&group->clock_work, PSI_FREQ);
+ u32 state_mask = psi_group_change(group, cpu, clear, set);
+
+ if (state_mask & group->poll_states)
+ psi_schedule_poll_work(group, 1);
+
+ if (wake_clock && !delayed_work_pending(&group->avgs_work))
+ schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
}
@@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup)
if (static_branch_likely(&psi_disabled))
return;
- cancel_delayed_work_sync(&cgroup->psi.clock_work);
+ cancel_delayed_work_sync(&cgroup->psi.avgs_work);
free_percpu(cgroup->psi.pcpu);
+ /* All triggers must be removed by now */
+ WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
}
/**
@@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
int full;
+ u64 now;
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
- update_stats(group);
+ /* Update averages before reporting them */
+ mutex_lock(&group->avgs_lock);
+ now = sched_clock();
+ collect_percpu_times(group, PSI_AVGS, NULL);
+ if (now >= group->avg_next_update)
+ group->avg_next_update = update_averages(group, now);
+ mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2 - (res == PSI_CPU); full++) {
unsigned long avg[3];
@@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
for (w = 0; w < 3; w++)
avg[w] = group->avg[res * 2 + full][w];
- total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC);
+ total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+ NSEC_PER_USEC);
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some",
@@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
return single_open(file, psi_cpu_show, NULL);
}
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+ char *buf, size_t nbytes, enum psi_res res)
+{
+ struct psi_trigger *t;
+ enum psi_states state;
+ u32 threshold_us;
+ u32 window_us;
+
+ if (static_branch_likely(&psi_disabled))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_SOME + res * 2;
+ else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
+ state = PSI_IO_FULL + res * 2;
+ else
+ return ERR_PTR(-EINVAL);
+
+ if (state >= PSI_NONIDLE)
+ return ERR_PTR(-EINVAL);
+
+ if (window_us < WINDOW_MIN_US ||
+ window_us > WINDOW_MAX_US)
+ return ERR_PTR(-EINVAL);
+
+ /* Check threshold */
+ if (threshold_us == 0 || threshold_us > window_us)
+ return ERR_PTR(-EINVAL);
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return ERR_PTR(-ENOMEM);
+
+ t->group = group;
+ t->state = state;
+ t->threshold = threshold_us * NSEC_PER_USEC;
+ t->win.size = window_us * NSEC_PER_USEC;
+ window_reset(&t->win, 0, 0, 0);
+
+ t->event = 0;
+ t->last_event_time = 0;
+ init_waitqueue_head(&t->event_wait);
+ kref_init(&t->refcount);
+
+ mutex_lock(&group->trigger_lock);
+
+ if (!rcu_access_pointer(group->poll_kworker)) {
+ struct sched_param param = {
+ .sched_priority = MAX_RT_PRIO - 1,
+ };
+ struct kthread_worker *kworker;
+
+ kworker = kthread_create_worker(0, "psimon");
+ if (IS_ERR(kworker)) {
+ kfree(t);
+ mutex_unlock(&group->trigger_lock);
+ return ERR_CAST(kworker);
+ }
+ sched_setscheduler(kworker->task, SCHED_FIFO, &param);
+ kthread_init_delayed_work(&group->poll_work,
+ psi_poll_work);
+ rcu_assign_pointer(group->poll_kworker, kworker);
+ }
+
+ list_add(&t->node, &group->triggers);
+ group->poll_min_period = min(group->poll_min_period,
+ div_u64(t->win.size, UPDATES_PER_WINDOW));
+ group->nr_triggers[t->state]++;
+ group->poll_states |= (1 << t->state);
+
+ mutex_unlock(&group->trigger_lock);
+
+ return t;
+}
+
+static void psi_trigger_destroy(struct kref *ref)
+{
+ struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
+ struct psi_group *group = t->group;
+ struct kthread_worker *kworker_to_destroy = NULL;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ /*
+ * Wakeup waiters to stop polling. Can happen if cgroup is deleted
+ * from under a polling process.
+ */
+ wake_up_interruptible(&t->event_wait);
+
+ mutex_lock(&group->trigger_lock);
+
+ if (!list_empty(&t->node)) {
+ struct psi_trigger *tmp;
+ u64 period = ULLONG_MAX;
+
+ list_del(&t->node);
+ group->nr_triggers[t->state]--;
+ if (!group->nr_triggers[t->state])
+ group->poll_states &= ~(1 << t->state);
+ /* reset min update period for the remaining triggers */
+ list_for_each_entry(tmp, &group->triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->poll_min_period = period;
+ /* Destroy poll_kworker when the last trigger is destroyed */
+ if (group->poll_states == 0) {
+ group->polling_until = 0;
+ kworker_to_destroy = rcu_dereference_protected(
+ group->poll_kworker,
+ lockdep_is_held(&group->trigger_lock));
+ rcu_assign_pointer(group->poll_kworker, NULL);
+ }
+ }
+
+ mutex_unlock(&group->trigger_lock);
+
+ /*
+ * Wait for both *trigger_ptr from psi_trigger_replace and
+ * poll_kworker RCUs to complete their read-side critical sections
+ * before destroying the trigger and optionally the poll_kworker
+ */
+ synchronize_rcu();
+ /*
+ * Destroy the kworker after releasing trigger_lock to prevent a
+ * deadlock while waiting for psi_poll_work to acquire trigger_lock
+ */
+ if (kworker_to_destroy) {
+ kthread_cancel_delayed_work_sync(&group->poll_work);
+ kthread_destroy_worker(kworker_to_destroy);
+ }
+ kfree(t);
+}
+
+void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
+{
+ struct psi_trigger *old = *trigger_ptr;
+
+ if (static_branch_likely(&psi_disabled))
+ return;
+
+ rcu_assign_pointer(*trigger_ptr, new);
+ if (old)
+ kref_put(&old->refcount, psi_trigger_destroy);
+}
+
+__poll_t psi_trigger_poll(void **trigger_ptr,
+ struct file *file, poll_table *wait)
+{
+ __poll_t ret = DEFAULT_POLLMASK;
+ struct psi_trigger *t;
+
+ if (static_branch_likely(&psi_disabled))
+ return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+
+ rcu_read_lock();
+
+ t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
+ if (!t) {
+ rcu_read_unlock();
+ return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+ }
+ kref_get(&t->refcount);
+
+ rcu_read_unlock();
+
+ poll_wait(file, &t->event_wait, wait);
+
+ if (cmpxchg(&t->event, 1, 0) == 1)
+ ret |= EPOLLPRI;
+
+ kref_put(&t->refcount, psi_trigger_destroy);
+
+ return ret;
+}
+
+static ssize_t psi_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, enum psi_res res)
+{
+ char buf[32];
+ size_t buf_size;
+ struct seq_file *seq;
+ struct psi_trigger *new;
+
+ if (static_branch_likely(&psi_disabled))
+ return -EOPNOTSUPP;
+
+ buf_size = min(nbytes, (sizeof(buf) - 1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size - 1] = '\0';
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ seq = file->private_data;
+ /* Take seq->lock to protect seq->private from concurrent writes */
+ mutex_lock(&seq->lock);
+ psi_trigger_replace(&seq->private, new);
+ mutex_unlock(&seq->lock);
+
+ return nbytes;
+}
+
+static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_IO);
+}
+
+static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_MEM);
+}
+
+static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
+ size_t nbytes, loff_t *ppos)
+{
+ return psi_write(file, user_buf, nbytes, PSI_CPU);
+}
+
+static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
+{
+ struct seq_file *seq = file->private_data;
+
+ return psi_trigger_poll(&seq->private, file, wait);
+}
+
+static int psi_fop_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+
+ psi_trigger_replace(&seq->private, NULL);
+ return single_release(inode, file);
+}
+
static const struct file_operations psi_io_fops = {
.open = psi_io_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_io_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static const struct file_operations psi_memory_fops = {
.open = psi_memory_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_memory_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static const struct file_operations psi_cpu_fops = {
.open = psi_cpu_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .write = psi_cpu_write,
+ .poll = psi_fop_poll,
+ .release = psi_fop_release,
};
static int __init psi_proc_init(void)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e4f398ad9e73..a532558a5176 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* rt task
*/
if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
return p;
}
@@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
/*
* The previous task needs to be made eligible for pushing
@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
watchdog(rq, p);
@@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = {
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
};
#ifdef CONFIG_RT_GROUP_SCHED
@@ -2555,6 +2559,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
if (rt_runtime_us < 0)
rt_runtime = RUNTIME_INF;
+ else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
@@ -2575,6 +2581,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
u64 rt_runtime, rt_period;
+ if (rt_period_us > U64_MAX / NSEC_PER_USEC)
+ return -EINVAL;
+
rt_period = rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index a26473674fb7..c529706bed11 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
-static const u32 runnable_avg_yN_inv[] = {
+static const u32 runnable_avg_yN_inv[] __maybe_unused = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d04530bf251f..802b1f3405f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
-#ifdef CONFIG_SMP
-extern void cpu_load_update_active(struct rq *this_rq);
-#else
-static inline void cpu_load_update_active(struct rq *this_rq) { }
-#endif
-
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -344,8 +338,10 @@ struct cfs_bandwidth {
u64 runtime_expires;
int expires_seq;
- short idle;
- short period_active;
+ u8 idle;
+ u8 period_active;
+ u8 distribute_running;
+ u8 slack_started;
struct hrtimer period_timer;
struct hrtimer slack_timer;
struct list_head throttled_cfs_rq;
@@ -354,8 +350,6 @@ struct cfs_bandwidth {
int nr_periods;
int nr_throttled;
u64 throttled_time;
-
- bool distribute_running;
#endif
};
@@ -780,7 +774,7 @@ struct root_domain {
* NULL-terminated list of performance domains intersecting with the
* CPUs of the rd. Protected by RCU.
*/
- struct perf_domain *pd;
+ struct perf_domain __rcu *pd;
};
extern struct root_domain def_root_domain;
@@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * struct uclamp_bucket - Utilization clamp bucket
+ * @value: utilization clamp value for tasks on this clamp bucket
+ * @tasks: number of RUNNABLE tasks on this clamp bucket
+ *
+ * Keep track of how many tasks are RUNNABLE for a given utilization
+ * clamp value.
+ */
+struct uclamp_bucket {
+ unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
+ unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
+};
+
+/*
+ * struct uclamp_rq - rq's utilization clamp
+ * @value: currently active clamp values for a rq
+ * @bucket: utilization clamp buckets affecting a rq
+ *
+ * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE
+ * (or actually running) with that value.
+ *
+ * There are up to UCLAMP_CNT possible different clamp values, currently there
+ * are only two: minimum utilization and maximum utilization.
+ *
+ * All utilization clamping values are MAX aggregated, since:
+ * - for util_min: we want to run the CPU at least at the max of the minimum
+ * utilization required by its currently RUNNABLE tasks.
+ * - for util_max: we want to allow the CPU to run up to the max of the
+ * maximum utilization allowed by its currently RUNNABLE tasks.
+ *
+ * Since on each system we expect only a limited number of different
+ * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
+ * the metrics required to compute all the per-rq utilization clamp values.
+ */
+struct uclamp_rq {
+ unsigned int value;
+ struct uclamp_bucket bucket[UCLAMP_BUCKETS];
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -818,8 +854,6 @@ struct rq {
unsigned int nr_preferred_running;
unsigned int numa_migrate_on;
#endif
- #define CPU_LOAD_IDX_MAX 5
- unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
@@ -830,11 +864,16 @@ struct rq {
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
- /* capture load from *all* tasks on this CPU: */
- struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
+#ifdef CONFIG_UCLAMP_TASK
+ /* Utilization clamp values based on CPU's RUNNABLE tasks */
+ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
+ unsigned int uclamp_flags;
+#define UCLAMP_FLAG_IDLE 0x01
+#endif
+
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
@@ -861,13 +900,16 @@ struct rq {
unsigned int clock_update_flags;
u64 clock;
- u64 clock_task;
+ /* Ensure that all clocks are in the same cache line */
+ u64 clock_task ____cacheline_aligned;
+ u64 clock_pelt;
+ unsigned long lost_idle_time;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
- struct root_domain *rd;
- struct sched_domain *sd;
+ struct root_domain *rd;
+ struct sched_domain __rcu *sd;
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
@@ -951,6 +993,22 @@ struct rq {
#endif
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* CPU runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->rq;
+}
+
+#else
+
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+ return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
@@ -1260,7 +1318,7 @@ extern void sched_ttwu_pending(void);
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See detach_destroy_domains: synchronize_sched for details.
+ * See destroy_sched_domains: call_rcu for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
@@ -1305,13 +1363,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
return sd;
}
-DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
extern struct static_key_false sched_asym_cpucapacity;
struct sched_group_capacity {
@@ -1460,9 +1518,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
#ifdef CONFIG_THREAD_INFO_IN_TASK
- p->cpu = cpu;
+ WRITE_ONCE(p->cpu, cpu);
#else
- task_thread_info(p)->cpu = cpu;
+ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
#endif
p->wake_cpu = cpu;
#endif
@@ -1563,7 +1621,7 @@ static inline int task_on_rq_queued(struct task_struct *p)
static inline int task_on_rq_migrating(struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_MIGRATING;
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}
/*
@@ -1630,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40];
struct sched_class {
const struct sched_class *next;
+#ifdef CONFIG_UCLAMP_TASK
+ int uclamp_enabled;
+#endif
+
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
@@ -1781,7 +1843,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
-extern void post_init_entity_util_avg(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct task_struct *p);
#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
@@ -2166,7 +2228,7 @@ static inline u64 irq_time_read(int cpu)
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
-DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
/**
* cpufreq_update_util - Take a note about CPU utilization changes.
@@ -2203,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
+#ifdef CONFIG_UCLAMP_TASK
+unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
+
+static __always_inline
+unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
+ struct task_struct *p)
+{
+ unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+ unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+ if (p) {
+ min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
+ max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
+ }
+
+ /*
+ * Since CPU's {min,max}_util clamps are MAX aggregated considering
+ * RUNNABLE tasks with _different_ clamps, we can end up with an
+ * inversion. Fix it now when the clamps are applied.
+ */
+ if (unlikely(min_util >= max_util))
+ return min_util;
+
+ return clamp(util, min_util, max_util);
+}
+
+static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+{
+ return uclamp_util_with(rq, util, NULL);
+}
+#else /* CONFIG_UCLAMP_TASK */
+static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
+ struct task_struct *p)
+{
+ return util;
+}
+static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+{
+ return util;
+}
+#endif /* CONFIG_UCLAMP_TASK */
+
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
# define arch_scale_freq_invariant() true
@@ -2211,7 +2315,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
# define arch_scale_freq_invariant() false
#endif
-#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+#endif
+
/**
* enum schedutil_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
@@ -2227,15 +2337,11 @@ enum schedutil_type {
ENERGY_UTIL,
};
-unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type);
-
-static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
-{
- unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
- return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL);
-}
+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type,
+ struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
@@ -2264,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
-static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
+static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type,
+ struct task_struct *p)
{
- return cfs;
+ return 0;
}
-#endif
+#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
@@ -2299,11 +2407,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
#endif
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+
#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
-#else
+
+DECLARE_STATIC_KEY_FALSE(sched_energy_present);
+
+static inline bool sched_energy_enabled(void)
+{
+ return static_branch_unlikely(&sched_energy_present);
+}
+
+#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+
#define perf_domain_span(pd) NULL
-#endif
+static inline bool sched_energy_enabled(void) { return false; }
-#ifdef CONFIG_SMP
-extern struct static_key_false sched_energy_present;
-#endif
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3f35ba1d8fde..f751ce0b783e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 1;
}
-DEFINE_STATIC_KEY_FALSE(sched_energy_present);
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
+unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;
+#ifdef CONFIG_PROC_SYSCTL
+int sched_energy_aware_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, state;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ state = static_branch_unlikely(&sched_energy_present);
+ if (state != sysctl_sched_energy_aware) {
+ mutex_lock(&sched_energy_mutex);
+ sched_energy_update = 1;
+ rebuild_sched_domains();
+ sched_energy_update = 0;
+ mutex_unlock(&sched_energy_mutex);
+ }
+ }
+
+ return ret;
+}
+#endif
+
static void free_pd(struct perf_domain *pd)
{
struct perf_domain *tmp;
@@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
struct cpufreq_policy *policy;
struct cpufreq_governor *gov;
+ if (!sysctl_sched_energy_aware)
+ goto free;
+
/* EAS is enabled for asymmetric CPU capacity topologies. */
if (!per_cpu(sd_asym_cpucapacity, cpu)) {
if (sched_debug()) {
@@ -442,7 +471,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
raw_spin_unlock_irqrestore(&rq->lock, flags);
if (old_rd)
- call_rcu_sched(&old_rd->rcu, free_rootdomain);
+ call_rcu(&old_rd->rcu, free_rootdomain);
}
void sched_get_rd(struct root_domain *rd)
@@ -455,7 +484,7 @@ void sched_put_rd(struct root_domain *rd)
if (!atomic_dec_and_test(&rd->refcount))
return;
- call_rcu_sched(&rd->rcu, free_rootdomain);
+ call_rcu(&rd->rcu, free_rootdomain);
}
static int init_rootdomain(struct root_domain *rd)
@@ -586,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
* the cpumask of the domain), this allows us to quickly tell if
* two CPUs are in the same cache domain, see cpus_share_cache().
*/
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu)
@@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
}
struct s_data {
- struct sched_domain ** __percpu sd;
+ struct sched_domain * __percpu *sd;
struct root_domain *rd;
};
@@ -1030,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
struct sched_domain *child = sd->child;
struct sched_group *sg;
+ bool already_visited;
if (child)
cpu = cpumask_first(sched_domain_span(child));
@@ -1037,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
sg = *per_cpu_ptr(sdd->sg, cpu);
sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
- /* For claim_allocations: */
- atomic_inc(&sg->ref);
- atomic_inc(&sg->sgc->ref);
+ /* Increase refcounts for claim_allocations: */
+ already_visited = atomic_inc_return(&sg->ref) > 1;
+ /* sgc visits should follow a similar trend as sg */
+ WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+ /* If we have already visited that group, it's already initialized. */
+ if (already_visited)
+ return sg;
if (child) {
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
@@ -1058,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
/*
* build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
*
* Assumes the sched_domain tree is fully constructed
*/
@@ -1309,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl,
.imbalance_pct = 125,
.cache_nice_tries = 0,
- .busy_idx = 0,
- .idle_idx = 0,
- .newidle_idx = 0,
- .wake_idx = 0,
- .forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
@@ -1365,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl,
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
- sd->busy_idx = 3;
- sd->idle_idx = 2;
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
@@ -1384,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
} else {
sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
- sd->idle_idx = 1;
}
/*
@@ -1849,10 +1874,10 @@ static struct sched_domain_topology_level
unsigned long cap;
/* Is there any asymmetry? */
- cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+ cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
for_each_cpu(i, cpu_map) {
- if (arch_scale_cpu_capacity(NULL, i) != cap) {
+ if (arch_scale_cpu_capacity(i) != cap) {
asym = true;
break;
}
@@ -1867,7 +1892,7 @@ static struct sched_domain_topology_level
* to everyone.
*/
for_each_cpu(i, cpu_map) {
- unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+ unsigned long max_capacity = arch_scale_cpu_capacity(i);
int tl_id = 0;
for_each_sd_topology(tl) {
@@ -1877,7 +1902,7 @@ static struct sched_domain_topology_level
for_each_cpu_and(j, tl->mask(i), cpu_map) {
unsigned long capacity;
- capacity = arch_scale_cpu_capacity(NULL, j);
+ capacity = arch_scale_cpu_capacity(j);
if (capacity <= max_capacity)
continue;
@@ -2046,9 +2071,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
}
/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated CPUs, but could be used to
- * exclude other special cases in the future.
+ * Set up scheduler domains and groups. For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
*/
int sched_init_domains(const struct cpumask *cpu_map)
{
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 6eb1f8efd221..c1e566a114ca 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic waiting primitives.
*
@@ -117,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
bookmark.func = NULL;
INIT_LIST_HEAD(&bookmark.entry);
- spin_lock_irqsave(&wq_head->lock, flags);
- nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
- spin_unlock_irqrestore(&wq_head->lock, flags);
-
- while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ do {
spin_lock_irqsave(&wq_head->lock, flags);
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
- }
+ } while (bookmark.flags & WQ_FLAG_BOOKMARK);
}
/**
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index c67c6d24adc2..45eba18a2898 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e815781ed751..dba52a7db5e8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -148,8 +148,8 @@ static void populate_seccomp_data(struct seccomp_data *sd)
unsigned long args[6];
sd->nr = syscall_get_nr(task, regs);
- sd->arch = syscall_get_arch();
- syscall_get_arguments(task, regs, 0, 6, args);
+ sd->arch = syscall_get_arch(task);
+ syscall_get_arguments(task, regs, args);
sd->args[0] = args[0];
sd->args[1] = args[1];
sd->args[2] = args[2];
@@ -267,6 +267,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
+ preempt_disable();
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
@@ -275,6 +276,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
*match = f;
}
}
+ preempt_enable();
return ret;
}
#endif /* CONFIG_SECCOMP_FILTER */
@@ -329,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent,
* Expects sighand and cred_guard_mutex locks to be held.
*
* Returns 0 on success, -ve on error, or the pid of a thread which was
- * either not in the correct seccomp mode or it did not have an ancestral
+ * either not in the correct seccomp mode or did not have an ancestral
* seccomp filter.
*/
static inline pid_t seccomp_can_sync_threads(void)
@@ -443,8 +445,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
* behavior of privileged children.
*/
if (!task_no_new_privs(current) &&
- security_capable_noaudit(current_cred(), current_user_ns(),
- CAP_SYS_ADMIN) != 0)
+ security_capable(current_cred(), current_user_ns(),
+ CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0)
return ERR_PTR(-EACCES);
/* Allocate a new seccomp_filter */
@@ -500,7 +502,10 @@ out:
*
* Caller must be holding current->sighand->siglock lock.
*
- * Returns 0 on success, -ve on error.
+ * Returns 0 on success, -ve on error, or
+ * - in TSYNC mode: the pid of a thread which was either not in the correct
+ * seccomp mode or did not have an ancestral seccomp filter
+ * - in NEW_LISTENER mode: the fd of the new listener
*/
static long seccomp_attach_filter(unsigned int flags,
struct seccomp_filter *filter)
@@ -589,7 +594,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason
info->si_code = SYS_SECCOMP;
info->si_call_addr = (void __user *)KSTK_EIP(current);
info->si_errno = reason;
- info->si_arch = syscall_get_arch();
+ info->si_arch = syscall_get_arch(current);
info->si_syscall = syscall;
}
@@ -604,7 +609,7 @@ static void seccomp_send_sigsys(int syscall, int reason)
{
struct kernel_siginfo info;
seccomp_init_siginfo(&info, syscall, reason);
- force_sig_info(SIGSYS, &info, current);
+ force_sig_info(&info);
}
#endif /* CONFIG_SECCOMP_FILTER */
@@ -1256,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags,
if (flags & ~SECCOMP_FILTER_FLAG_MASK)
return -EINVAL;
+ /*
+ * In the successful case, NEW_LISTENER returns the new listener fd.
+ * But in the failure case, TSYNC returns the thread that died. If you
+ * combine these two flags, there's no way to tell whether something
+ * succeeded or failed. So, let's disallow this combination.
+ */
+ if ((flags & SECCOMP_FILTER_FLAG_TSYNC) &&
+ (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER))
+ return -EINVAL;
+
/* Prepare the new filter before holding any locks. */
prepared = seccomp_prepare_user_filter(filter);
if (IS_ERR(prepared))
@@ -1302,7 +1317,7 @@ out:
mutex_unlock(&current->signal->cred_guard_mutex);
out_put_fd:
if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
- if (ret < 0) {
+ if (ret) {
listener_f->private_data = NULL;
fput(listener_f);
put_unused_fd(listener);
diff --git a/kernel/signal.c b/kernel/signal.c
index 57b7771e20d7..dabe100d2091 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/signal.c
*
@@ -19,7 +20,9 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
@@ -41,6 +44,8 @@
#include <linux/compiler.h>
#include <linux/posix-timers.h>
#include <linux/livepatch.h>
+#include <linux/cgroup.h>
+#include <linux/audit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -50,7 +55,6 @@
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
-#include "audit.h" /* audit_signal_info() */
/*
* SLAB caches for signal bits.
@@ -144,9 +148,10 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
static bool recalc_sigpending_tsk(struct task_struct *t)
{
- if ((t->jobctl & JOBCTL_PENDING_MASK) ||
+ if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
PENDING(&t->pending, &t->blocked) ||
- PENDING(&t->signal->shared_pending, &t->blocked)) {
+ PENDING(&t->signal->shared_pending, &t->blocked) ||
+ cgroup_task_frozen(t)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
return true;
}
@@ -836,6 +841,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
*/
if (!sid || sid == task_session(current))
break;
+ /* fall through */
default:
return -EPERM;
}
@@ -1051,29 +1057,8 @@ static inline bool legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
-#ifdef CONFIG_USER_NS
-static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t)
-{
- if (current_user_ns() == task_cred_xxx(t, user_ns))
- return;
-
- if (SI_FROMKERNEL(info))
- return;
-
- rcu_read_lock();
- info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns),
- make_kuid(current_user_ns(), info->si_uid));
- rcu_read_unlock();
-}
-#else
-static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t)
-{
- return;
-}
-#endif
-
static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
- enum pid_type type, int from_ancestor_ns)
+ enum pid_type type, bool force)
{
struct sigpending *pending;
struct sigqueue *q;
@@ -1083,8 +1068,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
assert_spin_locked(&t->sighand->siglock);
result = TRACE_SIGNAL_IGNORED;
- if (!prepare_signal(sig, t,
- from_ancestor_ns || (info == SEND_SIG_PRIV)))
+ if (!prepare_signal(sig, t, force))
goto ret;
pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
@@ -1129,7 +1113,11 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
q->info.si_code = SI_USER;
q->info.si_pid = task_tgid_nr_ns(current,
task_active_pid_ns(t));
- q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+ rcu_read_lock();
+ q->info.si_uid =
+ from_kuid_munged(task_cred_xxx(t, user_ns),
+ current_uid());
+ rcu_read_unlock();
break;
case (unsigned long) SEND_SIG_PRIV:
clear_siginfo(&q->info);
@@ -1141,30 +1129,24 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
break;
default:
copy_siginfo(&q->info, info);
- if (from_ancestor_ns)
- q->info.si_pid = 0;
break;
}
-
- userns_fixup_signal_uid(&q->info, t);
-
- } else if (!is_si_special(info)) {
- if (sig >= SIGRTMIN && info->si_code != SI_USER) {
- /*
- * Queue overflow, abort. We may abort if the
- * signal was rt and sent by user using something
- * other than kill().
- */
- result = TRACE_SIGNAL_OVERFLOW_FAIL;
- ret = -EAGAIN;
- goto ret;
- } else {
- /*
- * This is a silent loss of information. We still
- * send the signal, but the *info bits are lost.
- */
- result = TRACE_SIGNAL_LOSE_INFO;
- }
+ } else if (!is_si_special(info) &&
+ sig >= SIGRTMIN && info->si_code != SI_USER) {
+ /*
+ * Queue overflow, abort. We may abort if the
+ * signal was rt and sent by user using something
+ * other than kill().
+ */
+ result = TRACE_SIGNAL_OVERFLOW_FAIL;
+ ret = -EAGAIN;
+ goto ret;
+ } else {
+ /*
+ * This is a silent loss of information. We still
+ * send the signal, but the *info bits are lost.
+ */
+ result = TRACE_SIGNAL_LOSE_INFO;
}
out_set:
@@ -1191,17 +1173,62 @@ ret:
return ret;
}
+static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
+{
+ bool ret = false;
+ switch (siginfo_layout(info->si_signo, info->si_code)) {
+ case SIL_KILL:
+ case SIL_CHLD:
+ case SIL_RT:
+ ret = true;
+ break;
+ case SIL_TIMER:
+ case SIL_POLL:
+ case SIL_FAULT:
+ case SIL_FAULT_MCEERR:
+ case SIL_FAULT_BNDERR:
+ case SIL_FAULT_PKUERR:
+ case SIL_SYS:
+ ret = false;
+ break;
+ }
+ return ret;
+}
+
static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t,
enum pid_type type)
{
- int from_ancestor_ns = 0;
+ /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
+ bool force = false;
-#ifdef CONFIG_PID_NS
- from_ancestor_ns = si_fromuser(info) &&
- !task_pid_nr_ns(current, task_active_pid_ns(t));
-#endif
+ if (info == SEND_SIG_NOINFO) {
+ /* Force if sent from an ancestor pid namespace */
+ force = !task_pid_nr_ns(current, task_active_pid_ns(t));
+ } else if (info == SEND_SIG_PRIV) {
+ /* Don't ignore kernel generated signals */
+ force = true;
+ } else if (has_si_pid_and_uid(info)) {
+ /* SIGKILL and SIGSTOP is special or has ids */
+ struct user_namespace *t_user_ns;
+
+ rcu_read_lock();
+ t_user_ns = task_cred_xxx(t, user_ns);
+ if (current_user_ns() != t_user_ns) {
+ kuid_t uid = make_kuid(current_user_ns(), info->si_uid);
+ info->si_uid = from_kuid_munged(t_user_ns, uid);
+ }
+ rcu_read_unlock();
- return __send_signal(sig, info, t, type, from_ancestor_ns);
+ /* A kernel generated signal? */
+ force = (info->si_code == SI_KERNEL);
+
+ /* From an ancestor pid namespace? */
+ if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
+ info->si_pid = 0;
+ force = true;
+ }
+ }
+ return __send_signal(sig, info, t, type, force);
}
static void print_fatal_signal(int signr)
@@ -1268,12 +1295,13 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p
* We don't want to have recursive SIGSEGV's etc, for example,
* that is why we also clear SIGNAL_UNKILLABLE.
*/
-int
-force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t)
+static int
+force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t)
{
unsigned long int flags;
int ret, blocked, ignored;
struct k_sigaction *action;
+ int sig = info->si_signo;
spin_lock_irqsave(&t->sighand->siglock, flags);
action = &t->sighand->action[sig-1];
@@ -1298,6 +1326,11 @@ force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t)
return ret;
}
+int force_sig_info(struct kernel_siginfo *info)
+{
+ return force_sig_info_to_task(info, current);
+}
+
/*
* Nuke all other threads in the group.
*/
@@ -1434,13 +1467,44 @@ static inline bool kill_as_cred_perm(const struct cred *cred,
uid_eq(cred->uid, pcred->uid);
}
-/* like kill_pid_info(), but doesn't use uid/euid of "current" */
-int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid,
- const struct cred *cred)
+/*
+ * The usb asyncio usage of siginfo is wrong. The glibc support
+ * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
+ * AKA after the generic fields:
+ * kernel_pid_t si_pid;
+ * kernel_uid32_t si_uid;
+ * sigval_t si_value;
+ *
+ * Unfortunately when usb generates SI_ASYNCIO it assumes the layout
+ * after the generic fields is:
+ * void __user *si_addr;
+ *
+ * This is a practical problem when there is a 64bit big endian kernel
+ * and a 32bit userspace. As the 32bit address will encoded in the low
+ * 32bits of the pointer. Those low 32bits will be stored at higher
+ * address than appear in a 32 bit pointer. So userspace will not
+ * see the address it was expecting for it's completions.
+ *
+ * There is nothing in the encoding that can allow
+ * copy_siginfo_to_user32 to detect this confusion of formats, so
+ * handle this by requiring the caller of kill_pid_usb_asyncio to
+ * notice when this situration takes place and to store the 32bit
+ * pointer in sival_int, instead of sival_addr of the sigval_t addr
+ * parameter.
+ */
+int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
+ struct pid *pid, const struct cred *cred)
{
- int ret = -EINVAL;
+ struct kernel_siginfo info;
struct task_struct *p;
unsigned long flags;
+ int ret = -EINVAL;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = errno;
+ info.si_code = SI_ASYNCIO;
+ *((sigval_t *)&info.si_pid) = addr;
if (!valid_signal(sig))
return ret;
@@ -1451,17 +1515,17 @@ int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid,
ret = -ESRCH;
goto out_unlock;
}
- if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
+ if (!kill_as_cred_perm(cred, p)) {
ret = -EPERM;
goto out_unlock;
}
- ret = security_task_kill(p, info, sig, cred);
+ ret = security_task_kill(p, &info, sig, cred);
if (ret)
goto out_unlock;
if (sig) {
if (lock_task_sighand(p, &flags)) {
- ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0);
+ ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false);
unlock_task_sighand(p, &flags);
} else
ret = -ESRCH;
@@ -1470,7 +1534,7 @@ out_unlock:
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
+EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);
/*
* kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1546,9 +1610,17 @@ send_sig(int sig, struct task_struct *p, int priv)
}
EXPORT_SYMBOL(send_sig);
-void force_sig(int sig, struct task_struct *p)
+void force_sig(int sig)
{
- force_sig_info(sig, SEND_SIG_PRIV, p);
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_KERNEL;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ force_sig_info(&info);
}
EXPORT_SYMBOL(force_sig);
@@ -1558,18 +1630,20 @@ EXPORT_SYMBOL(force_sig);
* the problem was already a SIGSEGV, we'll want to
* make sure we don't even try to deliver the signal..
*/
-void force_sigsegv(int sig, struct task_struct *p)
+void force_sigsegv(int sig)
{
+ struct task_struct *p = current;
+
if (sig == SIGSEGV) {
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
- force_sig(SIGSEGV, p);
+ force_sig(SIGSEGV);
}
-int force_sig_fault(int sig, int code, void __user *addr
+int force_sig_fault_to_task(int sig, int code, void __user *addr
___ARCH_SI_TRAPNO(int trapno)
___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)
, struct task_struct *t)
@@ -1589,7 +1663,16 @@ int force_sig_fault(int sig, int code, void __user *addr
info.si_flags = flags;
info.si_isr = isr;
#endif
- return force_sig_info(info.si_signo, &info, t);
+ return force_sig_info_to_task(&info, t);
+}
+
+int force_sig_fault(int sig, int code, void __user *addr
+ ___ARCH_SI_TRAPNO(int trapno)
+ ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr))
+{
+ return force_sig_fault_to_task(sig, code, addr
+ ___ARCH_SI_TRAPNO(trapno)
+ ___ARCH_SI_IA64(imm, flags, isr), current);
}
int send_sig_fault(int sig, int code, void __user *addr
@@ -1615,7 +1698,7 @@ int send_sig_fault(int sig, int code, void __user *addr
return send_sig_info(info.si_signo, &info, t);
}
-int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
+int force_sig_mceerr(int code, void __user *addr, short lsb)
{
struct kernel_siginfo info;
@@ -1626,7 +1709,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct
info.si_code = code;
info.si_addr = addr;
info.si_addr_lsb = lsb;
- return force_sig_info(info.si_signo, &info, t);
+ return force_sig_info(&info);
}
int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
@@ -1655,7 +1738,7 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
info.si_addr = addr;
info.si_lower = lower;
info.si_upper = upper;
- return force_sig_info(info.si_signo, &info, current);
+ return force_sig_info(&info);
}
#ifdef SEGV_PKUERR
@@ -1669,7 +1752,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey)
info.si_code = SEGV_PKUERR;
info.si_addr = addr;
info.si_pkey = pkey;
- return force_sig_info(info.si_signo, &info, current);
+ return force_sig_info(&info);
}
#endif
@@ -1685,7 +1768,7 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr)
info.si_errno = errno;
info.si_code = TRAP_HWBKPT;
info.si_addr = addr;
- return force_sig_info(info.si_signo, &info, current);
+ return force_sig_info(&info);
}
int kill_pgrp(struct pid *pid, int sig, int priv)
@@ -1798,6 +1881,14 @@ ret:
return ret;
}
+static void do_notify_pidfd(struct task_struct *task)
+{
+ struct pid *pid;
+
+ pid = task_pid(task);
+ wake_up_all(&pid->wait_pidfd);
+}
+
/*
* Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead.
@@ -1821,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
+ /* Wake up all pidfd waiters */
+ do_notify_pidfd(tsk);
+
if (sig != SIGCHLD) {
/*
* This is only possible if parent == real_parent.
@@ -2106,7 +2200,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
preempt_disable();
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
+ cgroup_enter_frozen();
freezable_schedule();
+ cgroup_leave_frozen(true);
} else {
/*
* By the time we got the lock, our tracer went away.
@@ -2284,6 +2380,7 @@ static bool do_signal_stop(int signr)
}
/* Now we don't run again until woken by SIGCONT or SIGKILL */
+ cgroup_enter_frozen();
freezable_schedule();
return true;
} else {
@@ -2330,6 +2427,43 @@ static void do_jobctl_trap(void)
}
}
+/**
+ * do_freezer_trap - handle the freezer jobctl trap
+ *
+ * Puts the task into frozen state, if only the task is not about to quit.
+ * In this case it drops JOBCTL_TRAP_FREEZE.
+ *
+ * CONTEXT:
+ * Must be called with @current->sighand->siglock held,
+ * which is always released before returning.
+ */
+static void do_freezer_trap(void)
+ __releases(&current->sighand->siglock)
+{
+ /*
+ * If there are other trap bits pending except JOBCTL_TRAP_FREEZE,
+ * let's make another loop to give it a chance to be handled.
+ * In any case, we'll return back.
+ */
+ if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) !=
+ JOBCTL_TRAP_FREEZE) {
+ spin_unlock_irq(&current->sighand->siglock);
+ return;
+ }
+
+ /*
+ * Now we're sure that there is no pending fatal signal and no
+ * pending traps. Clear TIF_SIGPENDING to not get out of schedule()
+ * immediately (if there is a non-fatal signal pending), and
+ * put the task into sleep.
+ */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ clear_thread_flag(TIF_SIGPENDING);
+ spin_unlock_irq(&current->sighand->siglock);
+ cgroup_enter_frozen();
+ freezable_schedule();
+}
+
static int ptrace_signal(int signr, kernel_siginfo_t *info)
{
/*
@@ -2439,6 +2573,8 @@ relock:
if (signal_group_exit(signal)) {
ksig->info.si_signo = signr = SIGKILL;
sigdelset(&current->pending.signal, SIGKILL);
+ trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
+ &sighand->action[SIGKILL - 1]);
recalc_sigpending();
goto fatal;
}
@@ -2450,9 +2586,24 @@ relock:
do_signal_stop(0))
goto relock;
- if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
- do_jobctl_trap();
+ if (unlikely(current->jobctl &
+ (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
+ if (current->jobctl & JOBCTL_TRAP_MASK) {
+ do_jobctl_trap();
+ spin_unlock_irq(&sighand->siglock);
+ } else if (current->jobctl & JOBCTL_TRAP_FREEZE)
+ do_freezer_trap();
+
+ goto relock;
+ }
+
+ /*
+ * If the task is leaving the frozen state, let's update
+ * cgroup counters and reset the frozen bit.
+ */
+ if (unlikely(cgroup_task_frozen(current))) {
spin_unlock_irq(&sighand->siglock);
+ cgroup_leave_frozen(false);
goto relock;
}
@@ -2548,6 +2699,8 @@ relock:
fatal:
spin_unlock_irq(&sighand->siglock);
+ if (unlikely(cgroup_task_frozen(current)))
+ cgroup_leave_frozen(true);
/*
* Anything else is fatal, maybe with a core dump.
@@ -2611,7 +2764,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
{
if (failed)
- force_sigsegv(ksig->sig, current);
+ force_sigsegv(ksig->sig);
else
signal_delivered(ksig, stepping);
}
@@ -2847,7 +3000,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask);
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed in from userland for the syscalls.
*/
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
+ bool interrupted)
{
if (!usigmask)
@@ -2857,7 +3011,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
* Restoring sigmask here can lead to delivering signals that the above
* syscalls are intended to block because of the sigmask passed in.
*/
- if (signal_pending(current)) {
+ if (interrupted) {
current->saved_sigmask = *sigsaved;
set_restore_sigmask();
return;
@@ -3455,7 +3609,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
struct compat_siginfo __user *, uinfo,
struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
{
@@ -3487,6 +3641,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
#endif
#endif
+static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+{
+ clear_siginfo(info);
+ info->si_signo = sig;
+ info->si_errno = 0;
+ info->si_code = SI_USER;
+ info->si_pid = task_tgid_vnr(current);
+ info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
+}
+
/**
* sys_kill - send a signal to a process
* @pid: the PID of the process
@@ -3496,16 +3660,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct kernel_siginfo info;
- clear_siginfo(&info);
- info.si_signo = sig;
- info.si_errno = 0;
- info.si_code = SI_USER;
- info.si_pid = task_tgid_vnr(current);
- info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+ prepare_kill_siginfo(sig, &info);
return kill_something_info(sig, &info, pid);
}
+/*
+ * Verify that the signaler and signalee either are in the same pid namespace
+ * or that the signaler's pid namespace is an ancestor of the signalee's pid
+ * namespace.
+ */
+static bool access_pidfd_pidns(struct pid *pid)
+{
+ struct pid_namespace *active = task_active_pid_ns(current);
+ struct pid_namespace *p = ns_of_pid(pid);
+
+ for (;;) {
+ if (!p)
+ return false;
+ if (p == active)
+ break;
+ p = p->parent;
+ }
+
+ return true;
+}
+
+static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
+{
+#ifdef CONFIG_COMPAT
+ /*
+ * Avoid hooking up compat syscalls and instead handle necessary
+ * conversions here. Note, this is a stop-gap measure and should not be
+ * considered a generic solution.
+ */
+ if (in_compat_syscall())
+ return copy_siginfo_from_user32(
+ kinfo, (struct compat_siginfo __user *)info);
+#endif
+ return copy_siginfo_from_user(kinfo, info);
+}
+
+static struct pid *pidfd_to_pid(const struct file *file)
+{
+ if (file->f_op == &pidfd_fops)
+ return file->private_data;
+
+ return tgid_pidfd_to_pid(file);
+}
+
+/**
+ * sys_pidfd_send_signal - Signal a process through a pidfd
+ * @pidfd: file descriptor of the process
+ * @sig: signal to send
+ * @info: signal info
+ * @flags: future flags
+ *
+ * The syscall currently only signals via PIDTYPE_PID which covers
+ * kill(<positive-pid>, <signal>. It does not signal threads or process
+ * groups.
+ * In order to extend the syscall to threads and process groups the @flags
+ * argument should be used. In essence, the @flags argument will determine
+ * what is signaled and not the file descriptor itself. Put in other words,
+ * grouping is a property of the flags argument not a property of the file
+ * descriptor.
+ *
+ * Return: 0 on success, negative errno on failure
+ */
+SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
+ siginfo_t __user *, info, unsigned int, flags)
+{
+ int ret;
+ struct fd f;
+ struct pid *pid;
+ kernel_siginfo_t kinfo;
+
+ /* Enforce flags be set to 0 until we add an extension. */
+ if (flags)
+ return -EINVAL;
+
+ f = fdget(pidfd);
+ if (!f.file)
+ return -EBADF;
+
+ /* Is this a pidfd? */
+ pid = pidfd_to_pid(f.file);
+ if (IS_ERR(pid)) {
+ ret = PTR_ERR(pid);
+ goto err;
+ }
+
+ ret = -EINVAL;
+ if (!access_pidfd_pidns(pid))
+ goto err;
+
+ if (info) {
+ ret = copy_siginfo_from_user_any(&kinfo, info);
+ if (unlikely(ret))
+ goto err;
+
+ ret = -EINVAL;
+ if (unlikely(sig != kinfo.si_signo))
+ goto err;
+
+ /* Only allow sending arbitrary signals to yourself. */
+ ret = -EPERM;
+ if ((task_pid(current) != pid) &&
+ (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
+ goto err;
+ } else {
+ prepare_kill_siginfo(sig, &kinfo);
+ }
+
+ ret = kill_pid_info(sig, &kinfo, pid);
+
+err:
+ fdput(f);
+ return ret;
+}
+
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{
@@ -4292,6 +4565,28 @@ static inline void siginfo_buildtime_checks(void)
CHECK_OFFSET(si_syscall);
CHECK_OFFSET(si_arch);
#undef CHECK_OFFSET
+
+ /* usb asyncio */
+ BUILD_BUG_ON(offsetof(struct siginfo, si_pid) !=
+ offsetof(struct siginfo, si_addr));
+ if (sizeof(int) == sizeof(void __user *)) {
+ BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) !=
+ sizeof(void __user *));
+ } else {
+ BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) +
+ sizeof_field(struct siginfo, si_uid)) !=
+ sizeof(void __user *));
+ BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) !=
+ offsetof(struct siginfo, si_uid));
+ }
+#ifdef CONFIG_COMPAT
+ BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) !=
+ offsetof(struct compat_siginfo, si_addr));
+ BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
+ sizeof(compat_uptr_t));
+ BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
+ sizeof_field(struct siginfo, si_pid));
+#endif
}
void __init signals_init(void)
diff --git a/kernel/smp.c b/kernel/smp.c
index f4cf1b0bb3b8..616d4d114847 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic helpers for smp ipi calls
*
@@ -33,7 +34,7 @@ struct call_function_data {
cpumask_var_t cpumask_ipi;
};
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
+static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
@@ -486,13 +487,11 @@ EXPORT_SYMBOL(smp_call_function_many);
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler.
*/
-int smp_call_function(smp_call_func_t func, void *info, int wait)
+void smp_call_function(smp_call_func_t func, void *info, int wait)
{
preempt_disable();
smp_call_function_many(cpu_online_mask, func, info, wait);
preempt_enable();
-
- return 0;
}
EXPORT_SYMBOL(smp_call_function);
@@ -593,18 +592,16 @@ void __init smp_init(void)
* early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
* of local_irq_disable/enable().
*/
-int on_each_cpu(void (*func) (void *info), void *info, int wait)
+void on_each_cpu(void (*func) (void *info), void *info, int wait)
{
unsigned long flags;
- int ret = 0;
preempt_disable();
- ret = smp_call_function(func, info, wait);
+ smp_call_function(func, info, wait);
local_irq_save(flags);
func(info);
local_irq_restore(flags);
preempt_enable();
- return ret;
}
EXPORT_SYMBOL(on_each_cpu);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index c230c2dd48e1..2efe1e206167 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Common SMP CPU bringup/teardown functions
*/
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d28813306b2c..0427a86743a4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -1,10 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/softirq.c
*
* Copyright (C) 1992 Linus Torvalds
*
- * Distribute under GPLv2.
- *
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
*/
@@ -89,7 +88,8 @@ static bool ksoftirqd_running(unsigned long pending)
if (pending & SOFTIRQ_NOW_MASK)
return false;
- return tsk && (tsk->state == TASK_RUNNING);
+ return tsk && (tsk->state == TASK_RUNNING) &&
+ !__kthread_should_park(tsk);
}
/*
@@ -572,57 +572,6 @@ void tasklet_kill(struct tasklet_struct *t)
}
EXPORT_SYMBOL(tasklet_kill);
-/*
- * tasklet_hrtimer
- */
-
-/*
- * The trampoline is called when the hrtimer expires. It schedules a tasklet
- * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
- * hrtimer callback, but from softirq context.
- */
-static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
-{
- struct tasklet_hrtimer *ttimer =
- container_of(timer, struct tasklet_hrtimer, timer);
-
- tasklet_hi_schedule(&ttimer->tasklet);
- return HRTIMER_NORESTART;
-}
-
-/*
- * Helper function which calls the hrtimer callback from
- * tasklet/softirq context
- */
-static void __tasklet_hrtimer_trampoline(unsigned long data)
-{
- struct tasklet_hrtimer *ttimer = (void *)data;
- enum hrtimer_restart restart;
-
- restart = ttimer->function(&ttimer->timer);
- if (restart != HRTIMER_NORESTART)
- hrtimer_restart(&ttimer->timer);
-}
-
-/**
- * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
- * @ttimer: tasklet_hrtimer which is initialized
- * @function: hrtimer callback function which gets called from softirq context
- * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
- * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
- */
-void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
- enum hrtimer_restart (*function)(struct hrtimer *),
- clockid_t which_clock, enum hrtimer_mode mode)
-{
- hrtimer_init(&ttimer->timer, which_clock, mode);
- ttimer->timer.function = __hrtimer_tasklet_trampoline;
- tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
- (unsigned long)ttimer);
- ttimer->function = function;
-}
-EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
-
void __init softirq_init(void)
{
int cpu;
@@ -700,7 +649,7 @@ static int takeover_tasklets(unsigned int cpu)
/* Find end, append list for that CPU. */
if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
*__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
- this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
+ __this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
per_cpu(tasklet_vec, cpu).head = NULL;
per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index f8edee9c792d..e6a02b274b73 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/stacktrace.c
*
@@ -5,41 +6,56 @@
*
* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
+#include <linux/sched/task_stack.h>
+#include <linux/sched/debug.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
-void print_stack_trace(struct stack_trace *trace, int spaces)
+/**
+ * stack_trace_print - Print the entries in the stack trace
+ * @entries: Pointer to storage array
+ * @nr_entries: Number of entries in the storage array
+ * @spaces: Number of leading spaces to print
+ */
+void stack_trace_print(unsigned long *entries, unsigned int nr_entries,
+ int spaces)
{
- int i;
+ unsigned int i;
- if (WARN_ON(!trace->entries))
+ if (WARN_ON(!entries))
return;
- for (i = 0; i < trace->nr_entries; i++)
- printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]);
+ for (i = 0; i < nr_entries; i++)
+ printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);
}
-EXPORT_SYMBOL_GPL(print_stack_trace);
+EXPORT_SYMBOL_GPL(stack_trace_print);
-int snprint_stack_trace(char *buf, size_t size,
- struct stack_trace *trace, int spaces)
+/**
+ * stack_trace_snprint - Print the entries in the stack trace into a buffer
+ * @buf: Pointer to the print buffer
+ * @size: Size of the print buffer
+ * @entries: Pointer to storage array
+ * @nr_entries: Number of entries in the storage array
+ * @spaces: Number of leading spaces to print
+ *
+ * Return: Number of bytes printed.
+ */
+int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
+ unsigned int nr_entries, int spaces)
{
- int i;
- int generated;
- int total = 0;
+ unsigned int generated, i, total = 0;
- if (WARN_ON(!trace->entries))
+ if (WARN_ON(!entries))
return 0;
- for (i = 0; i < trace->nr_entries; i++) {
+ for (i = 0; i < nr_entries && size; i++) {
generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ',
- (void *)trace->entries[i]);
+ (void *)entries[i]);
total += generated;
-
- /* Assume that generated isn't a negative number */
if (generated >= size) {
buf += size;
size = 0;
@@ -51,7 +67,176 @@ int snprint_stack_trace(char *buf, size_t size,
return total;
}
-EXPORT_SYMBOL_GPL(snprint_stack_trace);
+EXPORT_SYMBOL_GPL(stack_trace_snprint);
+
+#ifdef CONFIG_ARCH_STACKWALK
+
+struct stacktrace_cookie {
+ unsigned long *store;
+ unsigned int size;
+ unsigned int skip;
+ unsigned int len;
+};
+
+static bool stack_trace_consume_entry(void *cookie, unsigned long addr,
+ bool reliable)
+{
+ struct stacktrace_cookie *c = cookie;
+
+ if (c->len >= c->size)
+ return false;
+
+ if (c->skip > 0) {
+ c->skip--;
+ return true;
+ }
+ c->store[c->len++] = addr;
+ return c->len < c->size;
+}
+
+static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr,
+ bool reliable)
+{
+ if (in_sched_functions(addr))
+ return true;
+ return stack_trace_consume_entry(cookie, addr, reliable);
+}
+
+/**
+ * stack_trace_save - Save a stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ .skip = skipnr + 1,
+ };
+
+ arch_stack_walk(consume_entry, &c, current, NULL);
+ return c.len;
+}
+EXPORT_SYMBOL_GPL(stack_trace_save);
+
+/**
+ * stack_trace_save_tsk - Save a task stack trace into a storage array
+ * @task: The task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store,
+ unsigned int size, unsigned int skipnr)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ .skip = skipnr + 1,
+ };
+
+ if (!try_get_task_stack(tsk))
+ return 0;
+
+ arch_stack_walk(consume_entry, &c, tsk, NULL);
+ put_task_stack(tsk);
+ return c.len;
+}
+
+/**
+ * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
+ * @regs: Pointer to pt_regs to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+ unsigned int size, unsigned int skipnr)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ .skip = skipnr,
+ };
+
+ arch_stack_walk(consume_entry, &c, current, regs);
+ return c.len;
+}
+
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+/**
+ * stack_trace_save_tsk_reliable - Save task stack with verification
+ * @tsk: Pointer to the task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: An error if it detects any unreliable features of the
+ * stack. Otherwise it guarantees that the stack trace is
+ * reliable and returns the number of entries stored.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is inactive.
+ */
+int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
+ unsigned int size)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ };
+ int ret;
+
+ /*
+ * If the task doesn't have a stack (e.g., a zombie), the stack is
+ * "reliably" empty.
+ */
+ if (!try_get_task_stack(tsk))
+ return 0;
+
+ ret = arch_stack_walk_reliable(consume_entry, &c, tsk);
+ put_task_stack(tsk);
+ return ret ? ret : c.len;
+}
+#endif
+
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
+/**
+ * stack_trace_save_user - Save a user space stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: Number of trace entries stored.
+ */
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
+{
+ stack_trace_consume_fn consume_entry = stack_trace_consume_entry;
+ struct stacktrace_cookie c = {
+ .store = store,
+ .size = size,
+ };
+
+ /* Trace user stack if not a kernel thread */
+ if (current->flags & PF_KTHREAD)
+ return 0;
+
+ arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
+ return c.len;
+}
+#endif
+
+#else /* CONFIG_ARCH_STACKWALK */
/*
* Architectures that do not implement save_stack_trace_*()
@@ -70,10 +255,117 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
}
-__weak int
-save_stack_trace_tsk_reliable(struct task_struct *tsk,
- struct stack_trace *trace)
+/**
+ * stack_trace_save - Save a stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+ unsigned int skipnr)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ .skip = skipnr + 1,
+ };
+
+ save_stack_trace(&trace);
+ return trace.nr_entries;
+}
+EXPORT_SYMBOL_GPL(stack_trace_save);
+
+/**
+ * stack_trace_save_tsk - Save a task stack trace into a storage array
+ * @task: The task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save_tsk(struct task_struct *task,
+ unsigned long *store, unsigned int size,
+ unsigned int skipnr)
{
- WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");
- return -ENOSYS;
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ .skip = skipnr + 1,
+ };
+
+ save_stack_trace_tsk(task, &trace);
+ return trace.nr_entries;
}
+
+/**
+ * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array
+ * @regs: Pointer to pt_regs to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ * @skipnr: Number of entries to skip at the start of the stack trace
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+ unsigned int size, unsigned int skipnr)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ .skip = skipnr,
+ };
+
+ save_stack_trace_regs(regs, &trace);
+ return trace.nr_entries;
+}
+
+#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE
+/**
+ * stack_trace_save_tsk_reliable - Save task stack with verification
+ * @tsk: Pointer to the task to examine
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: An error if it detects any unreliable features of the
+ * stack. Otherwise it guarantees that the stack trace is
+ * reliable and returns the number of entries stored.
+ *
+ * If the task is not 'current', the caller *must* ensure the task is inactive.
+ */
+int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
+ unsigned int size)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ };
+ int ret = save_stack_trace_tsk_reliable(tsk, &trace);
+
+ return ret ? ret : trace.nr_entries;
+}
+#endif
+
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
+/**
+ * stack_trace_save_user - Save a user space stack trace into a storage array
+ * @store: Pointer to storage array
+ * @size: Size of the storage array
+ *
+ * Return: Number of trace entries stored
+ */
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
+{
+ struct stack_trace trace = {
+ .entries = store,
+ .max_entries = size,
+ };
+
+ save_stack_trace_user(&trace);
+ return trace.nr_entries;
+}
+#endif /* CONFIG_USER_STACKTRACE_SUPPORT */
+
+#endif /* !CONFIG_ARCH_STACKWALK */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 067cb83f37ea..b4f83f7bdf86 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* kernel/stop_machine.c
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
* Copyright (C) 2010 SUSE Linux Products GmbH
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
- *
- * This file is released under the GPLv2 and any later version.
*/
#include <linux/completion.h>
#include <linux/cpu.h>
@@ -178,12 +177,18 @@ static void ack_state(struct multi_stop_data *msdata)
set_state(msdata, msdata->state + 1);
}
+void __weak stop_machine_yield(const struct cpumask *cpumask)
+{
+ cpu_relax();
+}
+
/* This is the cpu_stop function which stops the CPU. */
static int multi_cpu_stop(void *data)
{
struct multi_stop_data *msdata = data;
enum multi_stop_state curstate = MULTI_STOP_NONE;
int cpu = smp_processor_id(), err = 0;
+ const struct cpumask *cpumask;
unsigned long flags;
bool is_active;
@@ -193,15 +198,18 @@ static int multi_cpu_stop(void *data)
*/
local_save_flags(flags);
- if (!msdata->active_cpus)
- is_active = cpu == cpumask_first(cpu_online_mask);
- else
- is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
+ if (!msdata->active_cpus) {
+ cpumask = cpu_online_mask;
+ is_active = cpu == cpumask_first(cpumask);
+ } else {
+ cpumask = msdata->active_cpus;
+ is_active = cpumask_test_cpu(cpu, cpumask);
+ }
/* Simple state machine */
do {
/* Chill out and ensure we re-read multi_stop_state. */
- cpu_relax_yield();
+ stop_machine_yield(cpumask);
if (msdata->state != curstate) {
curstate = msdata->state;
switch (curstate) {
@@ -513,7 +521,7 @@ repeat:
}
preempt_count_dec();
WARN_ONCE(preempt_count(),
- "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg);
+ "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
goto repeat;
}
}
diff --git a/kernel/sys.c b/kernel/sys.c
index f7eb62eceb24..2969304c29fe 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -516,7 +516,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
new->uid = kruid;
if (!uid_eq(old->uid, kruid) &&
!uid_eq(old->euid, kruid) &&
- !ns_capable(old->user_ns, CAP_SETUID))
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
goto error;
}
@@ -525,7 +525,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (!uid_eq(old->uid, keuid) &&
!uid_eq(old->euid, keuid) &&
!uid_eq(old->suid, keuid) &&
- !ns_capable(old->user_ns, CAP_SETUID))
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
goto error;
}
@@ -584,7 +584,7 @@ long __sys_setuid(uid_t uid)
old = current_cred();
retval = -EPERM;
- if (ns_capable(old->user_ns, CAP_SETUID)) {
+ if (ns_capable_setid(old->user_ns, CAP_SETUID)) {
new->suid = new->uid = kuid;
if (!uid_eq(kuid, old->uid)) {
retval = set_user(new);
@@ -646,7 +646,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
old = current_cred();
retval = -EPERM;
- if (!ns_capable(old->user_ns, CAP_SETUID)) {
+ if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
!uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
goto error;
@@ -814,7 +814,7 @@ long __sys_setfsuid(uid_t uid)
if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
- ns_capable(old->user_ns, CAP_SETUID)) {
+ ns_capable_setid(old->user_ns, CAP_SETUID)) {
if (!uid_eq(kuid, old->fsuid)) {
new->fsuid = kuid;
if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -1747,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
if (who == RUSAGE_CHILDREN)
break;
+ /* fall through */
case RUSAGE_SELF:
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
@@ -1881,13 +1882,14 @@ exit_err:
}
/*
+ * Check arithmetic relations of passed addresses.
+ *
* WARNING: we don't require any capability here so be very careful
* in what is allowed for modification from userspace.
*/
-static int validate_prctl_map(struct prctl_mm_map *prctl_map)
+static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
{
unsigned long mmap_max_addr = TASK_SIZE;
- struct mm_struct *mm = current->mm;
int error = -EINVAL, i;
static const unsigned char offsets[] = {
@@ -1923,7 +1925,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
((unsigned long)prctl_map->__m1 __op \
(unsigned long)prctl_map->__m2) ? 0 : -EINVAL
error = __prctl_check_order(start_code, <, end_code);
- error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_data,<=, end_data);
error |= __prctl_check_order(start_brk, <=, brk);
error |= __prctl_check_order(arg_start, <=, arg_end);
error |= __prctl_check_order(env_start, <=, env_end);
@@ -1948,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
prctl_map->start_data))
goto out;
- /*
- * Someone is trying to cheat the auxv vector.
- */
- if (prctl_map->auxv_size) {
- if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
- goto out;
- }
-
- /*
- * Finally, make sure the caller has the rights to
- * change /proc/pid/exe link: only local sys admin should
- * be allowed to.
- */
- if (prctl_map->exe_fd != (u32)-1) {
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
- goto out;
- }
-
error = 0;
out:
return error;
@@ -1992,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
return -EFAULT;
- error = validate_prctl_map(&prctl_map);
+ error = validate_prctl_map_addr(&prctl_map);
if (error)
return error;
if (prctl_map.auxv_size) {
+ /*
+ * Someone is trying to cheat the auxv vector.
+ */
+ if (!prctl_map.auxv ||
+ prctl_map.auxv_size > sizeof(mm->saved_auxv))
+ return -EINVAL;
+
memset(user_auxv, 0, sizeof(user_auxv));
if (copy_from_user(user_auxv,
(const void __user *)prctl_map.auxv,
@@ -2009,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
}
if (prctl_map.exe_fd != (u32)-1) {
+ /*
+ * Make sure the caller has the rights to
+ * change /proc/pid/exe link: only local sys admin should
+ * be allowed to.
+ */
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EINVAL;
+
error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
if (error)
return error;
@@ -2096,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
struct mm_struct *mm = current->mm;
- struct prctl_mm_map prctl_map;
+ struct prctl_mm_map prctl_map = {
+ .auxv = NULL,
+ .auxv_size = 0,
+ .exe_fd = -1,
+ };
struct vm_area_struct *vma;
int error;
@@ -2124,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EINVAL;
- down_write(&mm->mmap_sem);
+ /*
+ * arg_lock protects concurent updates of arg boundaries, we need
+ * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr
+ * validation.
+ */
+ down_read(&mm->mmap_sem);
vma = find_vma(mm, addr);
+ spin_lock(&mm->arg_lock);
prctl_map.start_code = mm->start_code;
prctl_map.end_code = mm->end_code;
prctl_map.start_data = mm->start_data;
@@ -2138,9 +2147,6 @@ static int prctl_set_mm(int opt, unsigned long addr,
prctl_map.arg_end = mm->arg_end;
prctl_map.env_start = mm->env_start;
prctl_map.env_end = mm->env_end;
- prctl_map.auxv = NULL;
- prctl_map.auxv_size = 0;
- prctl_map.exe_fd = -1;
switch (opt) {
case PR_SET_MM_START_CODE:
@@ -2180,7 +2186,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
goto out;
}
- error = validate_prctl_map(&prctl_map);
+ error = validate_prctl_map_addr(&prctl_map);
if (error)
goto out;
@@ -2217,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = 0;
out:
- up_write(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
+ up_read(&mm->mmap_sem);
return error;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ab9d0e3c6d50..34b76895b81e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,10 +42,15 @@ COND_SYSCALL(io_destroy);
COND_SYSCALL(io_submit);
COND_SYSCALL_COMPAT(io_submit);
COND_SYSCALL(io_cancel);
+COND_SYSCALL(io_getevents_time32);
COND_SYSCALL(io_getevents);
+COND_SYSCALL(io_pgetevents_time32);
COND_SYSCALL(io_pgetevents);
-COND_SYSCALL_COMPAT(io_getevents);
+COND_SYSCALL_COMPAT(io_pgetevents_time32);
COND_SYSCALL_COMPAT(io_pgetevents);
+COND_SYSCALL(io_uring_setup);
+COND_SYSCALL(io_uring_enter);
+COND_SYSCALL(io_uring_register);
/* fs/xattr.c */
@@ -114,9 +119,9 @@ COND_SYSCALL_COMPAT(signalfd4);
/* fs/timerfd.c */
COND_SYSCALL(timerfd_create);
COND_SYSCALL(timerfd_settime);
-COND_SYSCALL_COMPAT(timerfd_settime);
+COND_SYSCALL(timerfd_settime32);
COND_SYSCALL(timerfd_gettime);
-COND_SYSCALL_COMPAT(timerfd_gettime);
+COND_SYSCALL(timerfd_gettime32);
/* fs/utimes.c */
@@ -132,10 +137,12 @@ COND_SYSCALL(capset);
/* kernel/exit.c */
/* kernel/fork.c */
+/* __ARCH_WANT_SYS_CLONE3 */
+COND_SYSCALL(clone3);
/* kernel/futex.c */
COND_SYSCALL(futex);
-COND_SYSCALL_COMPAT(futex);
+COND_SYSCALL(futex_time32);
COND_SYSCALL(set_robust_list);
COND_SYSCALL_COMPAT(set_robust_list);
COND_SYSCALL(get_robust_list);
@@ -162,8 +169,6 @@ COND_SYSCALL(syslog);
/* kernel/sched/core.c */
-/* kernel/signal.c */
-
/* kernel/sys.c */
COND_SYSCALL(setregid);
COND_SYSCALL(setgid);
@@ -187,9 +192,9 @@ COND_SYSCALL(mq_open);
COND_SYSCALL_COMPAT(mq_open);
COND_SYSCALL(mq_unlink);
COND_SYSCALL(mq_timedsend);
-COND_SYSCALL_COMPAT(mq_timedsend);
+COND_SYSCALL(mq_timedsend_time32);
COND_SYSCALL(mq_timedreceive);
-COND_SYSCALL_COMPAT(mq_timedreceive);
+COND_SYSCALL(mq_timedreceive_time32);
COND_SYSCALL(mq_notify);
COND_SYSCALL_COMPAT(mq_notify);
COND_SYSCALL(mq_getsetattr);
@@ -197,8 +202,10 @@ COND_SYSCALL_COMPAT(mq_getsetattr);
/* ipc/msg.c */
COND_SYSCALL(msgget);
+COND_SYSCALL(old_msgctl);
COND_SYSCALL(msgctl);
COND_SYSCALL_COMPAT(msgctl);
+COND_SYSCALL_COMPAT(old_msgctl);
COND_SYSCALL(msgrcv);
COND_SYSCALL_COMPAT(msgrcv);
COND_SYSCALL(msgsnd);
@@ -206,16 +213,20 @@ COND_SYSCALL_COMPAT(msgsnd);
/* ipc/sem.c */
COND_SYSCALL(semget);
+COND_SYSCALL(old_semctl);
COND_SYSCALL(semctl);
COND_SYSCALL_COMPAT(semctl);
+COND_SYSCALL_COMPAT(old_semctl);
COND_SYSCALL(semtimedop);
-COND_SYSCALL_COMPAT(semtimedop);
+COND_SYSCALL(semtimedop_time32);
COND_SYSCALL(semop);
/* ipc/shm.c */
COND_SYSCALL(shmget);
+COND_SYSCALL(old_shmctl);
COND_SYSCALL(shmctl);
COND_SYSCALL_COMPAT(shmctl);
+COND_SYSCALL_COMPAT(old_shmctl);
COND_SYSCALL(shmat);
COND_SYSCALL_COMPAT(shmat);
COND_SYSCALL(shmdt);
@@ -285,7 +296,7 @@ COND_SYSCALL(perf_event_open);
COND_SYSCALL(accept4);
COND_SYSCALL(recvmmsg);
COND_SYSCALL(recvmmsg_time32);
-COND_SYSCALL_COMPAT(recvmmsg);
+COND_SYSCALL_COMPAT(recvmmsg_time32);
COND_SYSCALL_COMPAT(recvmmsg_time64);
/*
@@ -366,6 +377,7 @@ COND_SYSCALL(kexec_file_load);
/* s390 */
COND_SYSCALL(s390_pci_mmio_read);
COND_SYSCALL(s390_pci_mmio_write);
+COND_SYSCALL(s390_ipc);
COND_SYSCALL_COMPAT(s390_ipc);
/* powerpc */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba4d9e85feb8..1c1ad1e14f21 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sysctl.c: General linux system control interface
*
@@ -66,6 +67,9 @@
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
+#include <linux/userfaultfd_k.h>
+
+#include "../lib/kstrtox.h"
#include <linux/uaccess.h>
#include <asm/processor.h>
@@ -126,7 +130,9 @@ static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
+static unsigned long zero_ul;
static unsigned long one_ul = 1;
+static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
@@ -446,6 +452,22 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rr_handler,
},
+#ifdef CONFIG_UCLAMP_TASK
+ {
+ .procname = "sched_util_clamp_min",
+ .data = &sysctl_sched_uclamp_util_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_max",
+ .data = &sysctl_sched_uclamp_util_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
@@ -467,6 +489,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
},
#endif
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+ {
+ .procname = "sched_energy_aware",
+ .data = &sysctl_sched_energy_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_energy_aware_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -1229,6 +1262,13 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
.extra2 = &one,
},
+ {
+ .procname = "bpf_stats_enabled",
+ .data = &bpf_stats_enabled_key.key,
+ .maxlen = sizeof(bpf_stats_enabled_key),
+ .mode = 0644,
+ .proc_handler = proc_do_static_key,
+ },
#endif
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
{
@@ -1446,7 +1486,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_extfrag_threshold,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = sysctl_extfrag_handler,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &min_extfrag_threshold,
.extra2 = &max_extfrag_threshold,
},
@@ -1691,6 +1731,17 @@ static struct ctl_table vm_table[] = {
.extra2 = (void *)&mmap_rnd_compat_bits_max,
},
#endif
+#ifdef CONFIG_USERFAULTFD
+ {
+ .procname = "unprivileged_userfaultfd",
+ .data = &sysctl_unprivileged_userfaultfd,
+ .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
@@ -1722,6 +1773,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(files_stat.max_files),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
+ .extra1 = &zero_ul,
+ .extra2 = &long_max,
},
{
.procname = "nr_open",
@@ -2092,6 +2145,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v)
}
}
+/**
+ * strtoul_lenient - parse an ASCII formatted integer from a buffer and only
+ * fail on overflow
+ *
+ * @cp: kernel buffer containing the string to parse
+ * @endp: pointer to store the trailing characters
+ * @base: the base to use
+ * @res: where the parsed integer will be stored
+ *
+ * In case of success 0 is returned and @res will contain the parsed integer,
+ * @endp will hold any trailing characters.
+ * This function will fail the parse on overflow. If there wasn't an overflow
+ * the function will defer the decision what characters count as invalid to the
+ * caller.
+ */
+static int strtoul_lenient(const char *cp, char **endp, unsigned int base,
+ unsigned long *res)
+{
+ unsigned long long result;
+ unsigned int rv;
+
+ cp = _parse_integer_fixup_radix(cp, &base);
+ rv = _parse_integer(cp, base, &result);
+ if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result))
+ return -ERANGE;
+
+ cp += rv;
+
+ if (endp)
+ *endp = (char *)cp;
+
+ *res = (unsigned long)result;
+ return 0;
+}
+
#define TMPBUFLEN 22
/**
* proc_get_long - reads an ASCII formatted integer from a user buffer
@@ -2135,7 +2223,8 @@ static int proc_get_long(char **buf, size_t *size,
if (!isdigit(*p))
return -EINVAL;
- *val = simple_strtoul(p, &p, 0);
+ if (strtoul_lenient(p, &p, 0, val))
+ return -EINVAL;
len = p - tmp;
@@ -2577,23 +2666,25 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
int *valp,
int write, void *data)
{
+ int tmp, ret;
struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
if (write) {
- int val = *negp ? -*lvalp : *lvalp;
- if ((param->min && *param->min > val) ||
- (param->max && *param->max < val))
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
return -EINVAL;
- *valp = val;
- } else {
- int val = *valp;
- if (val < 0) {
- *negp = true;
- *lvalp = -(unsigned long)val;
- } else {
- *negp = false;
- *lvalp = (unsigned long)val;
- }
+ *valp = tmp;
}
+
return 0;
}
@@ -2642,22 +2733,22 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
unsigned int *valp,
int write, void *data)
{
+ int ret;
+ unsigned int tmp;
struct do_proc_douintvec_minmax_conv_param *param = data;
+ /* write via temporary local uint for bounds-checking */
+ unsigned int *up = write ? &tmp : valp;
- if (write) {
- unsigned int val = *lvalp;
+ ret = do_proc_douintvec_conv(lvalp, up, write, data);
+ if (ret)
+ return ret;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
-
- if ((param->min && *param->min > val) ||
- (param->max && *param->max < val))
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
return -ERANGE;
- *valp = val;
- } else {
- unsigned int val = *valp;
- *lvalp = (unsigned long) val;
+ *valp = tmp;
}
return 0;
@@ -2805,8 +2896,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
if (neg)
continue;
val = convmul * val / convdiv;
- if ((min && val < *min) || (max && val > *max))
- continue;
+ if ((min && val < *min) || (max && val > *max)) {
+ err = -EINVAL;
+ break;
+ }
*i = val;
} else {
val = convdiv * (*i) / convmul;
@@ -3089,17 +3182,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
if (write) {
char *kbuf, *p;
+ size_t skipped = 0;
- if (left > PAGE_SIZE - 1)
+ if (left > PAGE_SIZE - 1) {
left = PAGE_SIZE - 1;
+ /* How much of the buffer we'll skip this pass */
+ skipped = *lenp - left;
+ }
p = kbuf = memdup_user_nul(buffer, left);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len),
- sizeof(unsigned long),
- GFP_KERNEL);
+ tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
if (!tmp_bitmap) {
kfree(kbuf);
return -ENOMEM;
@@ -3108,9 +3203,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
while (!err && left) {
unsigned long val_a, val_b;
bool neg;
+ size_t saved_left;
+ /* In case we stop parsing mid-number, we can reset */
+ saved_left = left;
err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
sizeof(tr_a), &c);
+ /*
+ * If we consumed the entirety of a truncated buffer or
+ * only one char is left (may be a "-"), then stop here,
+ * reset, & come back for more.
+ */
+ if ((left <= 1) && skipped) {
+ left = saved_left;
+ break;
+ }
+
if (err)
break;
if (val_a >= bitmap_len || neg) {
@@ -3128,6 +3236,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
err = proc_get_long(&p, &left, &val_b,
&neg, tr_b, sizeof(tr_b),
&c);
+ /*
+ * If we consumed all of a truncated buffer or
+ * then stop here, reset, & come back for more.
+ */
+ if (!left && skipped) {
+ left = saved_left;
+ break;
+ }
+
if (err)
break;
if (val_b >= bitmap_len || neg ||
@@ -3146,6 +3263,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
proc_skip_char(&p, &left, '\n');
}
kfree(kbuf);
+ left += skipped;
} else {
unsigned long bit_a, bit_b = 0;
@@ -3190,7 +3308,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
*ppos += *lenp;
}
- kfree(tmp_bitmap);
+ bitmap_free(tmp_bitmap);
return err;
}
@@ -3257,9 +3375,46 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
#endif /* CONFIG_PROC_SYSCTL */
+#if defined(CONFIG_SYSCTL)
+int proc_do_static_key(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct static_key *key = (struct static_key *)table->data;
+ static DEFINE_MUTEX(static_key_mutex);
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = &zero,
+ .extra2 = &one,
+ };
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&static_key_mutex);
+ val = static_key_enabled(key);
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (val)
+ static_key_enable(key);
+ else
+ static_key_disable(key);
+ }
+ mutex_unlock(&static_key_mutex);
+ return ret;
+}
+#endif
/*
* No sense putting this after each symbol definition, twice,
* exception granted :-)
@@ -3274,3 +3429,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+EXPORT_SYMBOL(proc_do_large_bitmap);
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4e62a4a8fa91..13a0f2e6ebc2 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -1,19 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* taskstats.c - Export per-task statistics to userland
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
* (C) Balbir Singh, IBM Corp. 2006
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/kernel.h>
@@ -375,7 +365,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
? TASKSTATS_TYPE_AGGR_PID
: TASKSTATS_TYPE_AGGR_TGID;
- na = nla_nest_start(skb, aggr);
+ na = nla_nest_start_noflag(skb, aggr);
if (!na)
goto err;
@@ -649,17 +639,41 @@ err:
static const struct genl_ops taskstats_ops[] = {
{
.cmd = TASKSTATS_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = taskstats_user_cmd,
- .policy = taskstats_cmd_get_policy,
- .flags = GENL_ADMIN_PERM,
+ /* policy enforced later */
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL,
},
{
.cmd = CGROUPSTATS_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = cgroupstats_user_cmd,
- .policy = cgroupstats_cmd_get_policy,
+ /* policy enforced later */
+ .flags = GENL_CMD_CAP_HASPOL,
},
};
+static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
+ struct genl_info *info)
+{
+ const struct nla_policy *policy = NULL;
+
+ switch (ops->cmd) {
+ case TASKSTATS_CMD_GET:
+ policy = taskstats_cmd_get_policy;
+ break;
+ case CGROUPSTATS_CMD_GET:
+ policy = cgroupstats_cmd_get_policy;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN,
+ TASKSTATS_CMD_ATTR_MAX, policy,
+ info->extack);
+}
+
static struct genl_family family __ro_after_init = {
.name = TASKSTATS_GENL_NAME,
.version = TASKSTATS_GENL_VERSION,
@@ -667,6 +681,7 @@ static struct genl_family family __ro_after_init = {
.module = THIS_MODULE,
.ops = taskstats_ops,
.n_ops = ARRAY_SIZE(taskstats_ops),
+ .pre_doit = taskstats_pre_doit,
};
/* Needed early in initialization */
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 7bca480151b0..76c997fdbc9d 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -1,17 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* test_kprobes.c - simple sanity test for *probes
*
* Copyright IBM Corp. 2008
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#define pr_fmt(fmt) "Kprobe smoke test: " fmt
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 58b981f4bb5d..fcc42353f125 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Timer subsystem related configuration options
#
@@ -117,6 +118,35 @@ config NO_HZ_FULL
endchoice
+config CONTEXT_TRACKING
+ bool
+
+config CONTEXT_TRACKING_FORCE
+ bool "Force context tracking"
+ depends on CONTEXT_TRACKING
+ default y if !NO_HZ_FULL
+ help
+ The major pre-requirement for full dynticks to work is to
+ support the context tracking subsystem. But there are also
+ other dependencies to provide in order to make the full
+ dynticks working.
+
+ This option stands for testing when an arch implements the
+ context tracking backend but doesn't yet fullfill all the
+ requirements to make the full dynticks feature working.
+ Without the full dynticks, there is no way to test the support
+ for context tracking and the subsystems that rely on it: RCU
+ userspace extended quiescent state and tickless cputime
+ accounting. This option copes with the absence of the full
+ dynticks subsystem by forcing the context tracking on all
+ CPUs in the system.
+
+ Say Y only if you're working on the development of an
+ architecture backend for the context tracking.
+
+ Say N otherwise, this option brings an overhead that you
+ don't want in production.
+
config NO_HZ
bool "Old Idle dynticks config"
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f1e46f338a9c..1867044800bb 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -16,5 +16,6 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
+obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2c97e8c2d29f..57518efc3810 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -233,7 +233,6 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
/**
* alarmtimer_suspend - Suspend time callback
* @dev: unused
- * @state: unused
*
* When we are going into suspend, we look through the bases
* to see which is the soonest timer to expire. We then
@@ -594,7 +593,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
- return ktime_sub(now, alarm->node.expires);
+ return ktime_sub(alarm->node.expires, now);
}
/**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 5e77662dd2d9..f5490222e134 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -611,6 +611,22 @@ void clockevents_resume(void)
}
#ifdef CONFIG_HOTPLUG_CPU
+
+# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+/**
+ * tick_offline_cpu - Take CPU out of the broadcast mechanism
+ * @cpu: The outgoing CPU
+ *
+ * Called on the outgoing CPU after it took itself offline.
+ */
+void tick_offline_cpu(unsigned int cpu)
+{
+ raw_spin_lock(&clockevents_lock);
+ tick_broadcast_offline(cpu);
+ raw_spin_unlock(&clockevents_lock);
+}
+# endif
+
/**
* tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu
*/
@@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu)
raw_spin_lock_irqsave(&clockevents_lock, flags);
- tick_shutdown_broadcast_oneshot(cpu);
- tick_shutdown_broadcast(cpu);
tick_shutdown(cpu);
/*
* Unregister the clock event devices which were
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 3bcc19ceb073..fff5f64981c6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -105,12 +105,12 @@ static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
-static void inline clocksource_watchdog_lock(unsigned long *flags)
+static inline void clocksource_watchdog_lock(unsigned long *flags)
{
spin_lock_irqsave(&watchdog_lock, *flags);
}
-static void inline clocksource_watchdog_unlock(unsigned long *flags)
+static inline void clocksource_watchdog_unlock(unsigned long *flags)
{
spin_unlock_irqrestore(&watchdog_lock, *flags);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f5cfa1b73d6f..5ee77f1a8a92 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -30,7 +30,6 @@
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
-#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched/signal.h>
@@ -364,7 +363,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
-
+ /* fall through */
default:
return false;
}
@@ -1115,9 +1114,10 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
* @timer: hrtimer to stop
*
* Returns:
- * 0 when the timer was not active
- * 1 when the timer was active
- * -1 when the timer is currently executing the callback function and
+ *
+ * * 0 when the timer was not active
+ * * 1 when the timer was active
+ * * -1 when the timer is currently executing the callback function and
* cannot be stopped
*/
int hrtimer_try_to_cancel(struct hrtimer *timer)
@@ -1771,7 +1771,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp,
+SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
struct old_timespec32 __user *, rmtp)
{
struct timespec64 tu;
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index dc1b6f1929f9..d23b434c2ca7 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
{
- unsigned long seq;
+ unsigned int seq;
u64 ret;
do {
@@ -89,7 +89,7 @@ struct clocksource * __init __weak clocksource_default_clock(void)
return &clocksource_jiffies;
}
-struct clocksource refined_jiffies;
+static struct clocksource refined_jiffies;
int register_refined_jiffies(long cycles_per_second)
{
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 36a2bef00125..65eb796610dc 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -17,6 +17,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/rtc.h>
+#include <linux/audit.h>
#include "ntp_internal.h"
#include "timekeeping_internal.h"
@@ -42,6 +43,7 @@ static u64 tick_length_base;
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
+#define MAX_TAI_OFFSET 100000
/*
* phase-lock loop variables
@@ -188,13 +190,13 @@ static inline int is_error_status(int status)
&& (status & (STA_PPSWANDER|STA_PPSERROR)));
}
-static inline void pps_fill_timex(struct timex *txc)
+static inline void pps_fill_timex(struct __kernel_timex *txc)
{
txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
PPM_SCALE_INV, NTP_SCALE_SHIFT);
txc->jitter = pps_jitter;
if (!(time_status & STA_NANO))
- txc->jitter /= NSEC_PER_USEC;
+ txc->jitter = pps_jitter / NSEC_PER_USEC;
txc->shift = pps_shift;
txc->stabil = pps_stabil;
txc->jitcnt = pps_jitcnt;
@@ -220,7 +222,7 @@ static inline int is_error_status(int status)
return status & (STA_UNSYNC|STA_CLOCKERR);
}
-static inline void pps_fill_timex(struct timex *txc)
+static inline void pps_fill_timex(struct __kernel_timex *txc)
{
/* PPS is not implemented, so these are zero */
txc->ppsfreq = 0;
@@ -633,7 +635,7 @@ void ntp_notify_cmos_timer(void)
/*
* Propagate a new txc->status value into the NTP state:
*/
-static inline void process_adj_status(const struct timex *txc)
+static inline void process_adj_status(const struct __kernel_timex *txc)
{
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
@@ -656,7 +658,8 @@ static inline void process_adj_status(const struct timex *txc)
}
-static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai)
+static inline void process_adjtimex_modes(const struct __kernel_timex *txc,
+ s32 *time_tai)
{
if (txc->modes & ADJ_STATUS)
process_adj_status(txc);
@@ -689,7 +692,8 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai
time_constant = max(time_constant, 0l);
}
- if (txc->modes & ADJ_TAI && txc->constant > 0)
+ if (txc->modes & ADJ_TAI &&
+ txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET)
*time_tai = txc->constant;
if (txc->modes & ADJ_OFFSET)
@@ -707,7 +711,8 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai
* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
+int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad)
{
int result;
@@ -718,18 +723,33 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
/* adjtime() is independent from ntp_adjtime() */
time_adjust = txc->offset;
ntp_update_frequency();
+
+ audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust);
+ audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust);
}
txc->offset = save_adjust;
} else {
-
/* If there are input parameters, then process them: */
- if (txc->modes)
+ if (txc->modes) {
+ audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset);
+ audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq);
+ audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status);
+ audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai);
+ audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec);
+
process_adjtimex_modes(txc, time_tai);
+ audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset);
+ audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq);
+ audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status);
+ audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai);
+ audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec);
+ }
+
txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
NTP_SCALE_SHIFT);
if (!(time_status & STA_NANO))
- txc->offset /= NSEC_PER_USEC;
+ txc->offset = (u32)txc->offset / NSEC_PER_USEC;
}
result = time_state; /* mostly `TIME_OK' */
@@ -754,7 +774,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai)
txc->time.tv_sec = (time_t)ts->tv_sec;
txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
- txc->time.tv_usec /= NSEC_PER_USEC;
+ txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC;
/* Handle leapsec adjustments */
if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index c24b0e13f011..908ecaa65fc3 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -8,6 +8,8 @@ extern void ntp_clear(void);
extern u64 ntp_tick_length(void);
extern ktime_t ntp_get_next_leap(void);
extern int second_overflow(time64_t secs);
-extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai);
+extern int __do_adjtimex(struct __kernel_timex *txc,
+ const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 425bbfce6819..ec960bb939fd 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -228,7 +228,7 @@ static void put_clock_desc(struct posix_clock_desc *cd)
fput(cd->fp);
}
-static int pc_clock_adjtime(clockid_t id, struct timex *tx)
+static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx)
{
struct posix_clock_desc cd;
int err;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 80f955210861..0a426f4e3125 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -67,13 +67,13 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)
int i;
u64 delta, incr;
- if (timer->it.cpu.incr == 0)
+ if (!timer->it_interval)
return;
if (now < timer->it.cpu.expires)
return;
- incr = timer->it.cpu.incr;
+ incr = timer->it_interval;
delta = now + incr - timer->it.cpu.expires;
/* Don't use (incr*2 < delta), incr*2 might overflow. */
@@ -520,7 +520,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
*/
wake_up_process(timer->it_process);
timer->it.cpu.expires = 0;
- } else if (timer->it.cpu.incr == 0) {
+ } else if (!timer->it_interval) {
/*
* One-shot timer. Clear it as soon as it's fired.
*/
@@ -606,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
*/
ret = 0;
- old_incr = timer->it.cpu.incr;
+ old_incr = timer->it_interval;
old_expires = timer->it.cpu.expires;
if (unlikely(timer->it.cpu.firing)) {
timer->it.cpu.firing = -1;
@@ -684,8 +684,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
* Install the new reload setting, and
* set up the signal and overrun bookkeeping.
*/
- timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
- timer->it_interval = ns_to_ktime(timer->it.cpu.incr);
+ timer->it_interval = timespec64_to_ktime(new->it_interval);
/*
* This acts as a modification timestamp for the timer,
@@ -724,7 +723,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
/*
* Easy part: convert the reload time.
*/
- itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
+ itp->it_interval = ktime_to_timespec64(timer->it_interval);
if (!timer->it.cpu.expires)
return;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index a51895486e5e..67df65f887ac 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -45,6 +45,7 @@ SYS_NI(timer_delete);
SYS_NI(clock_adjtime);
SYS_NI(getitimer);
SYS_NI(setitimer);
+SYS_NI(clock_adjtime32);
#ifdef __ARCH_WANT_SYS_ALARM
SYS_NI(alarm);
#endif
@@ -150,16 +151,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
#ifdef CONFIG_COMPAT
COMPAT_SYS_NI(timer_create);
-COMPAT_SYS_NI(clock_adjtime);
-COMPAT_SYS_NI(timer_settime);
-COMPAT_SYS_NI(timer_gettime);
COMPAT_SYS_NI(getitimer);
COMPAT_SYS_NI(setitimer);
#endif
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYS_NI(timer_settime32);
+SYS_NI(timer_gettime32);
+
+SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
struct timespec64 new_tp;
@@ -171,8 +172,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
return do_sys_settimeofday64(&new_tp, NULL);
}
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
int ret;
struct timespec64 kernel_tp;
@@ -186,8 +187,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
return 0;
}
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
struct timespec64 rtn_tp = {
.tv_sec = 0,
@@ -206,9 +207,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
}
}
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
- struct old_timespec32 __user *, rqtp,
- struct old_timespec32 __user *, rmtp)
+SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
+ struct old_timespec32 __user *, rqtp,
+ struct old_timespec32 __user *, rmtp)
{
struct timespec64 t;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0e84bb72a3da..d7f2d91acdac 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -179,7 +179,7 @@ static int posix_clock_realtime_set(const clockid_t which_clock,
}
static int posix_clock_realtime_adj(const clockid_t which_clock,
- struct timex *t)
+ struct __kernel_timex *t)
{
return do_adjtimex(t);
}
@@ -730,8 +730,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
- struct old_itimerspec32 __user *, setting)
+SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
+ struct old_itimerspec32 __user *, setting)
{
struct itimerspec64 cur_setting;
@@ -903,9 +903,9 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
}
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
- struct old_itimerspec32 __user *, new,
- struct old_itimerspec32 __user *, old)
+SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags,
+ struct old_itimerspec32 __user *, new,
+ struct old_itimerspec32 __user *, old)
{
struct itimerspec64 new_spec, old_spec;
struct itimerspec64 *rtn = old ? &old_spec : NULL;
@@ -980,23 +980,16 @@ retry_delete:
*/
static void itimer_delete(struct k_itimer *timer)
{
- unsigned long flags;
-
retry_delete:
- spin_lock_irqsave(&timer->it_lock, flags);
+ spin_lock_irq(&timer->it_lock);
if (timer_delete_hook(timer) == TIMER_RETRY) {
- unlock_timer(timer, flags);
+ spin_unlock_irq(&timer->it_lock);
goto retry_delete;
}
list_del(&timer->list);
- /*
- * This keeps any tasks waiting on the spin lock from thinking
- * they got something (see the lock code above).
- */
- timer->it_signal = NULL;
- unlock_timer(timer, flags);
+ spin_unlock_irq(&timer->it_lock);
release_posix_timer(timer, IT_ID_SET);
}
@@ -1047,22 +1040,28 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
return error;
}
-SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
- struct timex __user *, utx)
+int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timex ktx;
- int err;
if (!kc)
return -EINVAL;
if (!kc->clock_adj)
return -EOPNOTSUPP;
+ return kc->clock_adj(which_clock, ktx);
+}
+
+SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
+ struct __kernel_timex __user *, utx)
+{
+ struct __kernel_timex ktx;
+ int err;
+
if (copy_from_user(&ktx, utx, sizeof(ktx)))
return -EFAULT;
- err = kc->clock_adj(which_clock, &ktx);
+ err = do_clock_adjtime(which_clock, &ktx);
if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
return -EFAULT;
@@ -1090,8 +1089,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
@@ -1105,8 +1104,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
return kc->clock_set(which_clock, &ts);
}
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
@@ -1123,40 +1122,26 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
return err;
}
-#endif
-
-#ifdef CONFIG_COMPAT
-
-COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
- struct compat_timex __user *, utp)
+SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock,
+ struct old_timex32 __user *, utp)
{
- const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timex ktx;
+ struct __kernel_timex ktx;
int err;
- if (!kc)
- return -EINVAL;
- if (!kc->clock_adj)
- return -EOPNOTSUPP;
-
- err = compat_get_timex(&ktx, utp);
+ err = get_old_timex32(&ktx, utp);
if (err)
return err;
- err = kc->clock_adj(which_clock, &ktx);
+ err = do_clock_adjtime(which_clock, &ktx);
if (err >= 0)
- err = compat_put_timex(utp, &ktx);
+ err = put_old_timex32(utp, &ktx);
return err;
}
-#endif
-
-#ifdef CONFIG_COMPAT_32BIT_TIME
-
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
- struct old_timespec32 __user *, tp)
+SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock,
+ struct old_timespec32 __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 ts;
@@ -1212,9 +1197,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
#ifdef CONFIG_COMPAT_32BIT_TIME
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
- struct old_timespec32 __user *, rqtp,
- struct old_timespec32 __user *, rmtp)
+SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,
+ struct old_timespec32 __user *, rqtp,
+ struct old_timespec32 __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec64 t;
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index ddb21145211a..de5daa6d975a 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -8,7 +8,7 @@ struct k_clock {
const struct timespec64 *tp);
int (*clock_get)(const clockid_t which_clock,
struct timespec64 *tp);
- int (*clock_adj)(const clockid_t which_clock, struct timex *tx);
+ int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx);
int (*timer_create)(struct k_itimer *timer);
int (*nsleep)(const clockid_t which_clock, int flags,
const struct timespec64 *);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 094b82ca95e5..142b07619918 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void)
{
u64 cyc, res;
- unsigned long seq;
+ unsigned int seq;
struct clock_read_data *rd;
do {
@@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
- pr_debug("Registered %pF as sched_clock source\n", read);
+ pr_debug("Registered %pS as sched_clock source\n", read);
}
void __init generic_sched_clock_init(void)
@@ -267,12 +267,12 @@ void __init generic_sched_clock_init(void)
*/
static u64 notrace suspended_sched_clock_read(void)
{
- unsigned long seq = raw_read_seqcount(&cd.seq);
+ unsigned int seq = raw_read_seqcount(&cd.seq);
return cd.read_data[seq & 1].epoch_cyc;
}
-static int sched_clock_suspend(void)
+int sched_clock_suspend(void)
{
struct clock_read_data *rd = &cd.read_data[0];
@@ -283,7 +283,7 @@ static int sched_clock_suspend(void)
return 0;
}
-static void sched_clock_resume(void)
+void sched_clock_resume(void)
{
struct clock_read_data *rd = &cd.read_data[0];
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 803fa67aace9..e51778c312f1 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -36,10 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
static void tick_broadcast_clear_oneshot(int cpu);
static void tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+# ifdef CONFIG_HOTPLUG_CPU
+static void tick_broadcast_oneshot_offline(unsigned int cpu);
+# endif
#else
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }
static inline void tick_broadcast_clear_oneshot(int cpu) { }
static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { }
+# ifdef CONFIG_HOTPLUG_CPU
+static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { }
+# endif
#endif
/*
@@ -375,6 +381,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode)
switch (mode) {
case TICK_BROADCAST_FORCE:
tick_broadcast_forced = 1;
+ /* fall through */
case TICK_BROADCAST_ON:
cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
@@ -432,27 +439,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
}
#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Remove a CPU from broadcasting
- */
-void tick_shutdown_broadcast(unsigned int cpu)
+static void tick_shutdown_broadcast(void)
{
- struct clock_event_device *bc;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
- bc = tick_broadcast_device.evtdev;
- cpumask_clear_cpu(cpu, tick_broadcast_mask);
- cpumask_clear_cpu(cpu, tick_broadcast_on);
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
if (bc && cpumask_empty(tick_broadcast_mask))
clockevents_shutdown(bc);
}
+}
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+/*
+ * Remove a CPU from broadcasting
+ */
+void tick_broadcast_offline(unsigned int cpu)
+{
+ raw_spin_lock(&tick_broadcast_lock);
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_on);
+ tick_broadcast_oneshot_offline(cpu);
+ tick_shutdown_broadcast();
+ raw_spin_unlock(&tick_broadcast_lock);
}
+
#endif
void tick_suspend_broadcast(void)
@@ -800,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
* either the CPU handling the broadcast
* interrupt or we got woken by something else.
*
- * We are not longer in the broadcast mask, so
+ * We are no longer in the broadcast mask, so
* if the cpu local expiry time is already
* reached, we would reprogram the cpu local
* timer with an already expired event.
*
* This can lead to a ping-pong when we return
- * to idle and therefor rearm the broadcast
+ * to idle and therefore rearm the broadcast
* timer before the cpu local timer was able
* to fire. This happens because the forced
* reprogramming makes sure that the event
@@ -949,14 +958,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
}
/*
- * Remove a dead CPU from broadcasting
+ * Remove a dying CPU from broadcasting
*/
-void tick_shutdown_broadcast_oneshot(unsigned int cpu)
+static void tick_broadcast_oneshot_offline(unsigned int cpu)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
/*
* Clear the broadcast masks for the dead cpu, but do not stop
* the broadcast device!
@@ -964,8 +969,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu)
cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
-
- raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 529143b4c8d2..59225b484e4e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -46,6 +46,14 @@ ktime_t tick_period;
* procedure also covers cpu hotplug.
*/
int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
+ * tick_do_timer_cpu and it should be taken over by an eligible secondary
+ * when one comes online.
+ */
+static int tick_do_timer_boot_cpu __read_mostly = -1;
+#endif
/*
* Debugging: see timer_list.c
@@ -149,7 +157,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
!tick_broadcast_oneshot_active()) {
clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
} else {
- unsigned long seq;
+ unsigned int seq;
ktime_t next;
do {
@@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
}
}
+#ifdef CONFIG_NO_HZ_FULL
+static void giveup_do_timer(void *info)
+{
+ int cpu = *(unsigned int *)info;
+
+ WARN_ON(tick_do_timer_cpu != smp_processor_id());
+
+ tick_do_timer_cpu = cpu;
+}
+
+static void tick_take_do_timer_from_boot(void)
+{
+ int cpu = smp_processor_id();
+ int from = tick_do_timer_boot_cpu;
+
+ if (from >= 0 && from != cpu)
+ smp_call_function_single(from, giveup_do_timer, &cpu, 1);
+}
+#endif
+
/*
* Setup the tick device
*/
@@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
- if (!tick_nohz_full_cpu(cpu))
- tick_do_timer_cpu = cpu;
- else
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ tick_do_timer_cpu = cpu;
+
tick_next_period = ktime_get();
tick_period = NSEC_PER_SEC / HZ;
+#ifdef CONFIG_NO_HZ_FULL
+ /*
+ * The boot CPU may be nohz_full, in which case set
+ * tick_do_timer_boot_cpu so the first housekeeping
+ * secondary that comes up will take do_timer from
+ * us.
+ */
+ if (tick_nohz_full_cpu(cpu))
+ tick_do_timer_boot_cpu = cpu;
+
+ } else if (tick_do_timer_boot_cpu != -1 &&
+ !tick_nohz_full_cpu(cpu)) {
+ tick_take_do_timer_from_boot();
+ tick_do_timer_boot_cpu = -1;
+ WARN_ON(tick_do_timer_cpu != cpu);
+#endif
}
/*
@@ -487,6 +529,7 @@ void tick_freeze(void)
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
system_state = SYSTEM_SUSPEND;
+ sched_clock_suspend();
timekeeping_suspend();
} else {
tick_suspend_local();
@@ -510,6 +553,7 @@ void tick_unfreeze(void)
if (tick_freeze_depth == num_online_cpus()) {
timekeeping_resume();
+ sched_clock_resume();
system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index e277284c2831..7b2496136729 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
extern void tick_install_broadcast_device(struct clock_event_device *dev);
extern int tick_is_broadcast_device(struct clock_event_device *dev);
-extern void tick_shutdown_broadcast(unsigned int cpu);
extern void tick_suspend_broadcast(void);
extern void tick_resume_broadcast(void);
extern bool tick_resume_check_broadcast(void);
@@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev)
static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }
static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }
static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
-static inline void tick_shutdown_broadcast(unsigned int cpu) { }
static inline void tick_suspend_broadcast(void) { }
static inline void tick_resume_broadcast(void) { }
static inline bool tick_resume_check_broadcast(void) { return false; }
@@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
/* Functions related to oneshot broadcasting */
#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
extern void tick_broadcast_switch_to_oneshot(void);
-extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);
extern int tick_broadcast_oneshot_active(void);
extern void tick_check_oneshot_broadcast_this_cpu(void);
bool tick_broadcast_oneshot_available(void);
extern struct cpumask *tick_get_broadcast_oneshot_mask(void);
#else /* !(BROADCAST && ONESHOT): */
static inline void tick_broadcast_switch_to_oneshot(void) { }
-static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline void tick_check_oneshot_broadcast_this_cpu(void) { }
static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }
#endif /* !(BROADCAST && ONESHOT) */
+#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU)
+extern void tick_broadcast_offline(unsigned int cpu);
+#else
+static inline void tick_broadcast_offline(unsigned int cpu) { }
+#endif
+
/* NO_HZ_FULL internal */
#ifdef CONFIG_NO_HZ_FULL
extern void tick_nohz_init(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6fa52cd6df0b..be9707f68024 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
* into a long sleep. If two CPUs happen to assign themselves to
* this duty, then the jiffies update is still serialized by
* jiffies_lock.
+ *
+ * If nohz_full is enabled, this should not happen because the
+ * tick_do_timer_cpu never relinquishes.
*/
- if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
- && !tick_nohz_full_cpu(cpu))
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
+#ifdef CONFIG_NO_HZ_FULL
+ WARN_ON(tick_nohz_full_running);
+#endif
tick_do_timer_cpu = cpu;
+ }
#endif
/* Check, if the jiffies need an update */
@@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
static int tick_nohz_cpu_down(unsigned int cpu)
{
/*
- * The boot CPU handles housekeeping duty (unbound timers,
- * workqueues, timekeeping, ...) on behalf of full dynticks
+ * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+ * timers, workqueues, timekeeping, ...) on behalf of full dynticks
* CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
@@ -423,12 +429,15 @@ void __init tick_nohz_init(void)
return;
}
- cpu = smp_processor_id();
+ if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
+ !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
+ cpu = smp_processor_id();
- if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
- pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
- cpu);
- cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+ pr_warn("NO_HZ: Clearing %d from nohz_full range "
+ "for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ }
}
for_each_cpu(cpu, tick_nohz_full_mask)
@@ -645,7 +654,8 @@ static inline bool local_timer_softirq_pending(void)
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
- unsigned long seq, basejiff;
+ unsigned long basejiff;
+ unsigned int seq;
/* Read jiffies and the time when jiffies were updated last */
do {
@@ -772,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
- cpu_load_update_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -819,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
- cpu_load_update_nohz_stop();
/*
* Clear the timer idle flag, so we avoid IPIs on remote queueing and
@@ -904,8 +912,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
/*
* Boot safety: make sure the timekeeping duty has been
* assigned before entering dyntick-idle mode,
+ * tick_do_timer_cpu is TICK_DO_TIMER_BOOT
*/
- if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
+ return false;
+
+ /* Should not happen for nohz-full */
+ if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
return false;
}
@@ -1023,6 +1036,18 @@ bool tick_nohz_idle_got_tick(void)
}
/**
+ * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
+ * or the tick, whatever that expires first. Note that, if the tick has been
+ * stopped, it returns the next hrtimer.
+ *
+ * Called from power state control code with interrupts disabled
+ */
+ktime_t tick_nohz_get_next_hrtimer(void)
+{
+ return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
+}
+
+/**
* tick_nohz_get_sleep_length - return the expected length of the current sleep
* @delta_next: duration until the next event if the tick cannot be stopped
*
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 6de959a854b2..4fb06527cf64 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -24,12 +24,19 @@ enum tick_nohz_mode {
* struct tick_sched - sched tick emulation and no idle tick control/stats
* @sched_timer: hrtimer to schedule the periodic tick in high
* resolution mode
+ * @check_clocks: Notification mechanism about clocksource changes
+ * @nohz_mode: Mode - one state of tick_nohz_mode
+ * @inidle: Indicator that the CPU is in the tick idle mode
+ * @tick_stopped: Indicator that the idle tick has been stopped
+ * @idle_active: Indicator that the CPU is actively in the tick idle mode;
+ * it is resetted during irq handling phases.
+ * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @got_idle_tick: Tick timer function has run with @inidle set
* @last_tick: Store the last tick expiry time when the tick
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
- * @tick_stopped: Indicator that the idle tick has been stopped
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
@@ -40,8 +47,8 @@ enum tick_nohz_mode {
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
* @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
* @timer_expires_base: Base time clock monotonic for @timer_expires
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
- * @got_idle_tick: Tick timer function has run with @inidle set
+ * @next_timer: Expiry time of next expiring timer for debugging purpose only
+ * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
*/
struct tick_sched {
struct hrtimer sched_timer;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2edb5088a70b..5c54ca632d08 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -98,11 +98,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)
#endif /* __ARCH_WANT_SYS_TIME */
-#ifdef CONFIG_COMPAT
-#ifdef __ARCH_WANT_COMPAT_SYS_TIME
+#ifdef CONFIG_COMPAT_32BIT_TIME
+#ifdef __ARCH_WANT_SYS_TIME32
/* old_time32_t is a 32 bit "long" and needs to get converted. */
-COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc)
+SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)
{
old_time32_t i;
@@ -116,7 +116,7 @@ COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc)
return i;
}
-COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr)
+SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
{
struct timespec64 tv;
int err;
@@ -134,7 +134,7 @@ COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr)
return 0;
}
-#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+#endif /* __ARCH_WANT_SYS_TIME32 */
#endif
SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
@@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
static int firsttime = 1;
int error = 0;
- if (tv && !timespec64_valid(tv))
+ if (tv && !timespec64_valid_settod(tv))
return -EINVAL;
error = security_settime64(tv, tz);
@@ -251,6 +251,10 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
if (tv) {
if (compat_get_timeval(&user_tv, tv))
return -EFAULT;
+
+ if (!timeval_valid(&user_tv))
+ return -EINVAL;
+
new_ts.tv_sec = user_tv.tv_sec;
new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
@@ -263,35 +267,99 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
}
#endif
-SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
+#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT)
+SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
- struct timex txc; /* Local copy of parameter */
+ struct __kernel_timex txc; /* Local copy of parameter */
int ret;
/* Copy the user data space into the kernel copy
* structure. But bear in mind that the structures
* may change
*/
- if (copy_from_user(&txc, txc_p, sizeof(struct timex)))
+ if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex)))
return -EFAULT;
ret = do_adjtimex(&txc);
- return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
+ return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? -EFAULT : ret;
}
+#endif
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_COMPAT_32BIT_TIME
+int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp)
+{
+ struct old_timex32 tx32;
+
+ memset(txc, 0, sizeof(struct __kernel_timex));
+ if (copy_from_user(&tx32, utp, sizeof(struct old_timex32)))
+ return -EFAULT;
-COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
+ txc->modes = tx32.modes;
+ txc->offset = tx32.offset;
+ txc->freq = tx32.freq;
+ txc->maxerror = tx32.maxerror;
+ txc->esterror = tx32.esterror;
+ txc->status = tx32.status;
+ txc->constant = tx32.constant;
+ txc->precision = tx32.precision;
+ txc->tolerance = tx32.tolerance;
+ txc->time.tv_sec = tx32.time.tv_sec;
+ txc->time.tv_usec = tx32.time.tv_usec;
+ txc->tick = tx32.tick;
+ txc->ppsfreq = tx32.ppsfreq;
+ txc->jitter = tx32.jitter;
+ txc->shift = tx32.shift;
+ txc->stabil = tx32.stabil;
+ txc->jitcnt = tx32.jitcnt;
+ txc->calcnt = tx32.calcnt;
+ txc->errcnt = tx32.errcnt;
+ txc->stbcnt = tx32.stbcnt;
+
+ return 0;
+}
+
+int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc)
+{
+ struct old_timex32 tx32;
+
+ memset(&tx32, 0, sizeof(struct old_timex32));
+ tx32.modes = txc->modes;
+ tx32.offset = txc->offset;
+ tx32.freq = txc->freq;
+ tx32.maxerror = txc->maxerror;
+ tx32.esterror = txc->esterror;
+ tx32.status = txc->status;
+ tx32.constant = txc->constant;
+ tx32.precision = txc->precision;
+ tx32.tolerance = txc->tolerance;
+ tx32.time.tv_sec = txc->time.tv_sec;
+ tx32.time.tv_usec = txc->time.tv_usec;
+ tx32.tick = txc->tick;
+ tx32.ppsfreq = txc->ppsfreq;
+ tx32.jitter = txc->jitter;
+ tx32.shift = txc->shift;
+ tx32.stabil = txc->stabil;
+ tx32.jitcnt = txc->jitcnt;
+ tx32.calcnt = txc->calcnt;
+ tx32.errcnt = txc->errcnt;
+ tx32.stbcnt = txc->stbcnt;
+ tx32.tai = txc->tai;
+ if (copy_to_user(utp, &tx32, sizeof(struct old_timex32)))
+ return -EFAULT;
+ return 0;
+}
+
+SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
{
- struct timex txc;
+ struct __kernel_timex txc;
int err, ret;
- err = compat_get_timex(&txc, utp);
+ err = get_old_timex32(&txc, utp);
if (err)
return err;
ret = do_adjtimex(&txc);
- err = compat_put_timex(utp, &txc);
+ err = put_old_timex32(utp, &txc);
if (err)
return err;
@@ -719,6 +787,16 @@ u64 jiffies64_to_nsecs(u64 j)
}
EXPORT_SYMBOL(jiffies64_to_nsecs);
+u64 jiffies64_to_msecs(const u64 j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ return (MSEC_PER_SEC / HZ) * j;
+#else
+ return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
+#endif
+}
+EXPORT_SYMBOL(jiffies64_to_msecs);
+
/**
* nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ac5dbf2cd4a2..d911c8470149 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -21,6 +21,7 @@
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
+#include <linux/audit.h>
#include "tick-internal.h"
#include "ntp_internal.h"
@@ -720,7 +721,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
void ktime_get_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -807,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ u64 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
+ nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
} while (read_seqcount_retry(&tk_core.seq, seq));
- return base;
-
+ return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
@@ -829,7 +831,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
ktime_t *offset = offsets[offs];
- unsigned long seq;
+ unsigned int seq;
ktime_t tconv;
do {
@@ -960,7 +962,7 @@ time64_t __ktime_get_real_seconds(void)
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
ktime_t base_raw;
ktime_t base_real;
u64 nsec_raw;
@@ -1122,7 +1124,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
ktime_t base_real, base_raw;
u64 nsec_real, nsec_raw;
u8 cs_was_changed_seq;
- unsigned long seq;
+ unsigned int seq;
bool do_interp;
int ret;
@@ -1221,7 +1223,7 @@ int do_settimeofday64(const struct timespec64 *ts)
unsigned long flags;
int ret = 0;
- if (!timespec64_valid_strict(ts))
+ if (!timespec64_valid_settod(ts))
return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1250,6 +1252,9 @@ out:
/* signal hrtimers about time change */
clock_was_set();
+ if (!ret)
+ audit_tk_injoffset(ts_delta);
+
return ret;
}
EXPORT_SYMBOL(do_settimeofday64);
@@ -1278,7 +1283,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts)
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tk), *ts);
if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
- !timespec64_valid_strict(&tmp)) {
+ !timespec64_valid_settod(&tmp)) {
ret = -EINVAL;
goto error;
}
@@ -1409,7 +1414,7 @@ int timekeeping_notify(struct clocksource *clock)
void ktime_get_raw_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
u64 nsecs;
do {
@@ -1431,7 +1436,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64);
int timekeeping_valid_for_hres(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
int ret;
do {
@@ -1450,7 +1455,7 @@ int timekeeping_valid_for_hres(void)
u64 timekeeping_max_deferment(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
u64 ret;
do {
@@ -1527,7 +1532,7 @@ void __init timekeeping_init(void)
unsigned long flags;
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
- if (timespec64_valid_strict(&wall_time) &&
+ if (timespec64_valid_settod(&wall_time) &&
timespec64_to_ns(&wall_time) > 0) {
persistent_clock_exists = true;
} else if (timespec64_to_ns(&wall_time) != 0) {
@@ -2150,7 +2155,7 @@ EXPORT_SYMBOL_GPL(getboottime64);
void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- unsigned long seq;
+ unsigned int seq;
do {
seq = read_seqcount_begin(&tk_core.seq);
@@ -2164,7 +2169,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now, mono;
- unsigned long seq;
+ unsigned int seq;
do {
seq = read_seqcount_begin(&tk_core.seq);
@@ -2234,7 +2239,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
/**
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-static int timekeeping_validate_timex(const struct timex *txc)
+static int timekeeping_validate_timex(const struct __kernel_timex *txc)
{
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
@@ -2300,9 +2305,10 @@ static int timekeeping_validate_timex(const struct timex *txc)
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
*/
-int do_adjtimex(struct timex *txc)
+int do_adjtimex(struct __kernel_timex *txc)
{
struct timekeeper *tk = &tk_core.timekeeper;
+ struct audit_ntp_data ad;
unsigned long flags;
struct timespec64 ts;
s32 orig_tai, tai;
@@ -2322,15 +2328,19 @@ int do_adjtimex(struct timex *txc)
ret = timekeeping_inject_offset(&delta);
if (ret)
return ret;
+
+ audit_tk_injoffset(delta);
}
+ audit_ntp_init(&ad);
+
ktime_get_real_ts64(&ts);
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&tk_core.seq);
orig_tai = tai = tk->tai_offset;
- ret = __do_adjtimex(txc, &ts, &tai);
+ ret = __do_adjtimex(txc, &ts, &tai, &ad);
if (tai != orig_tai) {
__timekeeping_set_tai_offset(tk, tai);
@@ -2341,6 +2351,8 @@ int do_adjtimex(struct timex *txc)
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ audit_ntp_log(&ad);
+
/* Update the multiplier immediately if frequency was set directly */
if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
timekeeping_advance(TK_ADV_FREQ);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 7a9b4eb7a1d5..141ab3ab0354 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -14,6 +14,13 @@ extern u64 timekeeping_max_deferment(void);
extern void timekeeping_warp_clock(void);
extern int timekeeping_suspend(void);
extern void timekeeping_resume(void);
+#ifdef CONFIG_GENERIC_SCHED_CLOCK
+extern int sched_clock_suspend(void);
+extern void sched_clock_resume(void);
+#else
+static inline int sched_clock_suspend(void) { return 0; }
+static inline void sched_clock_resume(void) { }
+#endif
extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index 86489950d690..b73e8850e58d 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -37,15 +37,8 @@ DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time);
static int __init tk_debug_sleep_time_init(void)
{
- struct dentry *d;
-
- d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
- &tk_debug_sleep_time_fops);
- if (!d) {
- pr_err("Failed to create sleep_time debug file\n");
- return -ENOMEM;
- }
-
+ debugfs_create_file("sleep_time", 0444, NULL, NULL,
+ &tk_debug_sleep_time_fops);
return 0;
}
late_initcall(tk_debug_sleep_time_init);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 444156debfa0..343c7ba33b1c 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
hlist_add_head(&timer->entry, base->vectors + idx);
__set_bit(idx, base->pending_map);
timer_set_idx(timer, idx);
+
+ trace_timer_start(timer, timer->expires, timer->flags);
}
static void
@@ -647,7 +649,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
case ODEBUG_STATE_ACTIVE:
WARN_ON(1);
-
+ /* fall through */
default:
return false;
}
@@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer)
trace_timer_init(timer);
}
-static inline void
-debug_activate(struct timer_list *timer, unsigned long expires)
-{
- debug_timer_activate(timer);
- trace_timer_start(timer, expires, timer->flags);
-}
-
static inline void debug_deactivate(struct timer_list *timer)
{
debug_timer_deactivate(timer);
@@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
}
- debug_activate(timer, expires);
+ debug_timer_activate(timer);
timer->expires = expires;
/*
@@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
}
forward_timer_base(base);
- debug_activate(timer, timer->expires);
+ debug_timer_activate(timer);
internal_add_timer(base, timer);
raw_spin_unlock_irqrestore(&base->lock, flags);
}
@@ -1298,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer)
EXPORT_SYMBOL(del_timer_sync);
#endif
-static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
+static void call_timer_fn(struct timer_list *timer,
+ void (*fn)(struct timer_list *),
+ unsigned long baseclk)
{
int count = preempt_count();
@@ -1321,14 +1318,14 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list
*/
lock_map_acquire(&lockdep_map);
- trace_timer_expire_entry(timer);
+ trace_timer_expire_entry(timer, baseclk);
fn(timer);
trace_timer_expire_exit(timer);
lock_map_release(&lockdep_map);
if (count != preempt_count()) {
- WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
+ WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
fn, count, preempt_count());
/*
* Restore the preempt count. That gives us a decent
@@ -1342,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
+ /*
+ * This value is required only for tracing. base->clk was
+ * incremented directly before expire_timers was called. But expiry
+ * is related to the old base->clk value.
+ */
+ unsigned long baseclk = base->clk - 1;
+
while (!hlist_empty(head)) {
struct timer_list *timer;
void (*fn)(struct timer_list *);
@@ -1355,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
if (timer->flags & TIMER_IRQSAFE) {
raw_spin_unlock(&base->lock);
- call_timer_fn(timer, fn);
+ call_timer_fn(timer, fn, baseclk);
raw_spin_lock(&base->lock);
} else {
raw_spin_unlock_irq(&base->lock);
- call_timer_fn(timer, fn);
+ call_timer_fn(timer, fn, baseclk);
raw_spin_lock_irq(&base->lock);
}
}
@@ -1632,7 +1636,7 @@ void update_process_times(int user_tick)
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
- rcu_check_callbacks(user_tick);
+ rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_tick();
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 98ba50dcb1b2..acb326f5f50a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -282,23 +282,6 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
SEQ_printf(m, "\n");
}
-static int timer_list_show(struct seq_file *m, void *v)
-{
- struct timer_list_iter *iter = v;
-
- if (iter->cpu == -1 && !iter->second_pass)
- timer_list_header(m, iter->now);
- else if (!iter->second_pass)
- print_cpu(m, iter->cpu, iter->now);
-#ifdef CONFIG_GENERIC_CLOCKEVENTS
- else if (iter->cpu == -1 && iter->second_pass)
- timer_list_show_tickdevices_header(m);
- else
- print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
-#endif
- return 0;
-}
-
void sysrq_timer_list_show(void)
{
u64 now = ktime_to_ns(ktime_get());
@@ -317,6 +300,24 @@ void sysrq_timer_list_show(void)
return;
}
+#ifdef CONFIG_PROC_FS
+static int timer_list_show(struct seq_file *m, void *v)
+{
+ struct timer_list_iter *iter = v;
+
+ if (iter->cpu == -1 && !iter->second_pass)
+ timer_list_header(m, iter->now);
+ else if (!iter->second_pass)
+ print_cpu(m, iter->cpu, iter->now);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ else if (iter->cpu == -1 && iter->second_pass)
+ timer_list_show_tickdevices_header(m);
+ else
+ print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
+#endif
+ return 0;
+}
+
static void *move_iter(struct timer_list_iter *iter, loff_t offset)
{
for (; offset; offset--) {
@@ -376,3 +377,4 @@ static int __init init_timer_list_procfs(void)
return 0;
}
__initcall(init_timer_list_procfs);
+#endif
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
new file mode 100644
index 000000000000..8cf3596a4ce6
--- /dev/null
+++ b/kernel/time/vsyscall.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 ARM Ltd.
+ *
+ * Generic implementation of update_vsyscall and update_vsyscall_tz.
+ *
+ * Based on the x86 specific implementation.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/timekeeper_internal.h>
+#include <vdso/datapage.h>
+#include <vdso/helpers.h>
+#include <vdso/vsyscall.h>
+
+static inline void update_vdso_data(struct vdso_data *vdata,
+ struct timekeeper *tk)
+{
+ struct vdso_timestamp *vdso_ts;
+ u64 nsec;
+
+ vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
+ vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
+ vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
+ vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
+ vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
+ vdata[CS_RAW].mask = tk->tkr_raw.mask;
+ vdata[CS_RAW].mult = tk->tkr_raw.mult;
+ vdata[CS_RAW].shift = tk->tkr_raw.shift;
+
+ /* CLOCK_REALTIME */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
+ vdso_ts->sec = tk->xtime_sec;
+ vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
+
+ /* CLOCK_MONOTONIC */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
+ vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+
+ nsec = tk->tkr_mono.xtime_nsec;
+ nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
+ while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
+ nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
+ vdso_ts->sec++;
+ }
+ vdso_ts->nsec = nsec;
+
+ /* CLOCK_MONOTONIC_RAW */
+ vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
+ vdso_ts->sec = tk->raw_sec;
+ vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
+
+ /* CLOCK_BOOTTIME */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
+ vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsec = tk->tkr_mono.xtime_nsec;
+ nsec += ((u64)(tk->wall_to_monotonic.tv_nsec +
+ ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift);
+ while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
+ nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
+ vdso_ts->sec++;
+ }
+ vdso_ts->nsec = nsec;
+
+ /* CLOCK_TAI */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
+ vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
+ vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
+
+ /*
+ * Read without the seqlock held by clock_getres().
+ * Note: No need to have a second copy.
+ */
+ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_timestamp *vdso_ts;
+ u64 nsec;
+
+ if (__arch_update_vdso_data()) {
+ /*
+ * Some architectures might want to skip the update of the
+ * data page.
+ */
+ return;
+ }
+
+ /* copy vsyscall data */
+ vdso_write_begin(vdata);
+
+ vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk);
+ vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk);
+
+ /* CLOCK_REALTIME_COARSE */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
+ vdso_ts->sec = tk->xtime_sec;
+ vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+
+ /* CLOCK_MONOTONIC_COARSE */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
+ vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec = nsec + tk->wall_to_monotonic.tv_nsec;
+ vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
+
+ if (__arch_use_vsyscall(vdata))
+ update_vdso_data(vdata, tk);
+
+ __arch_update_vsyscall(vdata, tk);
+
+ vdso_write_end(vdata);
+
+ __arch_sync_vdso_data(vdata);
+}
+
+void update_vsyscall_tz(void)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+
+ if (__arch_use_vsyscall(vdata)) {
+ vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
+ vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
+ }
+
+ __arch_sync_vdso_data(vdata);
+}
diff --git a/kernel/torture.c b/kernel/torture.c
index bbf6d473e50c..a8d9bdfba7c3 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Common functions for in-kernel torture tests.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
* Copyright (C) IBM Corporation, 2014
*
- * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Author: Paul E. McKenney <paulmck@linux.ibm.com>
* Based on kernel/rcu/torture.c.
*/
@@ -53,7 +40,7 @@
#include "rcu/rcu.h"
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
static char *torture_type;
static int verbose;
@@ -75,6 +62,7 @@ static DEFINE_MUTEX(fullstop_mutex);
static struct task_struct *onoff_task;
static long onoff_holdoff;
static long onoff_interval;
+static torture_ofl_func *onoff_f;
static long n_offline_attempts;
static long n_offline_successes;
static unsigned long sum_offline;
@@ -100,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
return false;
+ if (num_online_cpus() <= 1)
+ return false; /* Can't offline the last CPU. */
if (verbose > 1)
pr_alert("%s" TORTURE_FLAG
@@ -118,6 +108,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
pr_alert("%s" TORTURE_FLAG
"torture_onoff task: offlined %d\n",
torture_type, cpu);
+ if (onoff_f)
+ onoff_f();
(*n_offl_successes)++;
delta = jiffies - starttime;
*sum_offl += delta;
@@ -243,11 +235,12 @@ stop:
/*
* Initiate online-offline handling.
*/
-int torture_onoff_init(long ooholdoff, long oointerval)
+int torture_onoff_init(long ooholdoff, long oointerval, torture_ofl_func *f)
{
#ifdef CONFIG_HOTPLUG_CPU
onoff_holdoff = ooholdoff;
onoff_interval = oointerval;
+ onoff_f = f;
if (onoff_interval <= 0)
return 0;
return torture_create_kthread(torture_onoff, NULL, onoff_task);
@@ -577,6 +570,7 @@ static void torture_shutdown_cleanup(void)
static struct task_struct *stutter_task;
static int stutter_pause_test;
static int stutter;
+static int stutter_gap;
/*
* Block until the stutter interval ends. This must be called periodically
@@ -585,10 +579,12 @@ static int stutter;
bool stutter_wait(const char *title)
{
int spt;
+ bool ret = false;
cond_resched_tasks_rcu_qs();
spt = READ_ONCE(stutter_pause_test);
for (; spt; spt = READ_ONCE(stutter_pause_test)) {
+ ret = true;
if (spt == 1) {
schedule_timeout_interruptible(1);
} else if (spt == 2) {
@@ -599,7 +595,7 @@ bool stutter_wait(const char *title)
}
torture_shutdown_absorb(title);
}
- return !!spt;
+ return ret;
}
EXPORT_SYMBOL_GPL(stutter_wait);
@@ -609,17 +605,24 @@ EXPORT_SYMBOL_GPL(stutter_wait);
*/
static int torture_stutter(void *arg)
{
+ int wtime;
+
VERBOSE_TOROUT_STRING("torture_stutter task started");
do {
if (!torture_must_stop() && stutter > 1) {
- WRITE_ONCE(stutter_pause_test, 1);
- schedule_timeout_interruptible(stutter - 1);
+ wtime = stutter;
+ if (stutter > HZ + 1) {
+ WRITE_ONCE(stutter_pause_test, 1);
+ wtime = stutter - HZ - 1;
+ schedule_timeout_interruptible(wtime);
+ wtime = HZ + 1;
+ }
WRITE_ONCE(stutter_pause_test, 2);
- schedule_timeout_interruptible(1);
+ schedule_timeout_interruptible(wtime);
}
WRITE_ONCE(stutter_pause_test, 0);
if (!torture_must_stop())
- schedule_timeout_interruptible(stutter);
+ schedule_timeout_interruptible(stutter_gap);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
@@ -629,9 +632,10 @@ static int torture_stutter(void *arg)
/*
* Initialize and kick off the torture_stutter kthread.
*/
-int torture_stutter_init(const int s)
+int torture_stutter_init(const int s, const int sgap)
{
stutter = s;
+ stutter_gap = sgap;
return torture_create_kthread(torture_stutter, NULL, stutter_task);
}
EXPORT_SYMBOL_GPL(torture_stutter_init);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fa8b1fe824f3..564e5fdb025f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Architectures that offer an FUNCTION_TRACER implementation should
# select HAVE_FUNCTION_TRACER:
@@ -370,6 +371,7 @@ config PROFILE_ANNOTATED_BRANCHES
config PROFILE_ALL_BRANCHES
bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
+ imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives
help
This tracer profiles all branch conditions. Every if ()
taken in the kernel is recorded whether it hit or miss.
@@ -773,13 +775,6 @@ config TRACE_EVAL_MAP_FILE
If unsure, say N
-config TRACING_EVENTS_GPIO
- bool "Trace gpio events"
- depends on GPIOLIB
- default y
- help
- Enable tracing events for gpio subsystem
-
config GCOV_PROFILE_FTRACE
bool "Enable GCOV profiling on ftrace subsystem"
depends on GCOV_KERNEL
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fac0ddf8a8e2..2d6e93ab0478 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -512,8 +512,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
dir = debugfs_lookup(buts->name, blk_debugfs_root);
if (!dir)
bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
- if (!dir)
- goto err;
bt->dev = dev;
atomic_set(&bt->dropped, 0);
@@ -522,12 +520,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = -EIO;
bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
&blk_dropped_fops);
- if (!bt->dropped_file)
- goto err;
bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
- if (!bt->msg_file)
- goto err;
bt->rchan = relay_open("trace", dir, buts->buf_size,
buts->buf_nr, &blk_relay_callbacks, bt);
@@ -723,6 +717,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
#endif
case BLKTRACESTART:
start = 1;
+ /* fall through */
case BLKTRACESTOP:
ret = __blk_trace_startstop(q, start);
break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f1a86a0d881d..ca1255d14576 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,9 +14,14 @@
#include <linux/syscalls.h>
#include <linux/error-injection.h>
+#include <asm/tlb.h>
+
#include "trace_probe.h"
#include "trace.h"
+#define bpf_event_rcu_dereference(p) \
+ rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
+
#ifdef CONFIG_MODULES
struct bpf_trace_module {
struct module *module;
@@ -163,6 +168,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
* access_ok() should prevent writing to non-user memory, but in
* some situations (nommu, temporary switch, etc) access_ok() does
* not provide enough validation, hence the check on KERNEL_DS.
+ *
+ * nmi_uaccess_okay() ensures the probe is not run in an interim
+ * state, when the task or mm are switched. This is specifically
+ * required to prevent the use of temporary mm.
*/
if (unlikely(in_interrupt() ||
@@ -170,6 +179,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
return -EPERM;
if (unlikely(uaccess_kernel()))
return -EPERM;
+ if (unlikely(!nmi_uaccess_okay()))
+ return -EPERM;
if (!access_ok(unsafe_ptr, size))
return -EPERM;
@@ -402,8 +413,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.arg4_type = ARG_CONST_SIZE,
};
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
-
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_sample_data *sd)
@@ -431,28 +440,53 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_event_output(event, sd, regs);
- return 0;
+ return perf_event_output(event, sd, regs);
}
+/*
+ * Support executing tracepoints in normal, irq, and nmi context that each call
+ * bpf_perf_event_output
+ */
+struct bpf_trace_sample_data {
+ struct perf_sample_data sds[3];
+};
+
+static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
+static DEFINE_PER_CPU(int, bpf_trace_nest_level);
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
- struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
+ struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
+ int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
struct perf_raw_record raw = {
.frag = {
.size = size,
.data = data,
},
};
+ struct perf_sample_data *sd;
+ int err;
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
+ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ sd = &sds->sds[nest_level - 1];
+
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
+ err = -EINVAL;
+ goto out;
+ }
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
- return __bpf_perf_event_output(regs, map, flags, sd);
+ err = __bpf_perf_event_output(regs, map, flags, sd);
+
+out:
+ this_cpu_dec(bpf_trace_nest_level);
+ return err;
}
static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -560,6 +594,69 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
.arg3_type = ARG_ANYTHING,
};
+struct send_signal_irq_work {
+ struct irq_work irq_work;
+ struct task_struct *task;
+ u32 sig;
+};
+
+static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
+
+static void do_bpf_send_signal(struct irq_work *entry)
+{
+ struct send_signal_irq_work *work;
+
+ work = container_of(entry, struct send_signal_irq_work, irq_work);
+ group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
+}
+
+BPF_CALL_1(bpf_send_signal, u32, sig)
+{
+ struct send_signal_irq_work *work = NULL;
+
+ /* Similar to bpf_probe_write_user, task needs to be
+ * in a sound condition and kernel memory access be
+ * permitted in order to send signal to the current
+ * task.
+ */
+ if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
+ return -EPERM;
+ if (unlikely(uaccess_kernel()))
+ return -EPERM;
+ if (unlikely(!nmi_uaccess_okay()))
+ return -EPERM;
+
+ if (in_nmi()) {
+ /* Do an early check on signal validity. Otherwise,
+ * the error is lost in deferred irq_work.
+ */
+ if (unlikely(!valid_signal(sig)))
+ return -EINVAL;
+
+ work = this_cpu_ptr(&send_signal_work);
+ if (work->irq_work.flags & IRQ_WORK_BUSY)
+ return -EBUSY;
+
+ /* Add the current task, which is the target of sending signal,
+ * to the irq_work. The current task may change when queued
+ * irq works get executed.
+ */
+ work->task = current;
+ work->sig = sig;
+ irq_work_queue(&work->irq_work);
+ return 0;
+ }
+
+ return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
+}
+
+static const struct bpf_func_proto bpf_send_signal_proto = {
+ .func = bpf_send_signal,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -570,6 +667,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_update_elem_proto;
case BPF_FUNC_map_delete_elem:
return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_map_push_elem:
+ return &bpf_map_push_elem_proto;
+ case BPF_FUNC_map_pop_elem:
+ return &bpf_map_pop_elem_proto;
+ case BPF_FUNC_map_peek_elem:
+ return &bpf_map_peek_elem_proto;
case BPF_FUNC_probe_read:
return &bpf_probe_read_proto;
case BPF_FUNC_ktime_get_ns:
@@ -604,6 +707,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
#endif
+ case BPF_FUNC_send_signal:
+ return &bpf_send_signal_proto;
default:
return NULL;
}
@@ -809,16 +914,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
/*
* bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
* to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
+ *
+ * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
+ * in normal, irq, and nmi context.
*/
-static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
+struct bpf_raw_tp_regs {
+ struct pt_regs regs[3];
+};
+static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
+static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
+static struct pt_regs *get_bpf_raw_tp_regs(void)
+{
+ struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
+
+ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+ this_cpu_dec(bpf_raw_tp_nest_level);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return &tp_regs->regs[nest_level - 1];
+}
+
+static void put_bpf_raw_tp_regs(void)
+{
+ this_cpu_dec(bpf_raw_tp_nest_level);
+}
+
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
- return ____bpf_perf_event_output(regs, map, flags, data, size);
+ ret = ____bpf_perf_event_output(regs, map, flags, data, size);
+
+ put_bpf_raw_tp_regs();
+ return ret;
}
static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
@@ -835,12 +972,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
- return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
- flags, 0, 0);
+ ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+ flags, 0, 0);
+ put_bpf_raw_tp_regs();
+ return ret;
}
static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
@@ -855,11 +998,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
void *, buf, u32, size, u64, flags)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
- return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
- (unsigned long) size, flags, 0);
+ ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+ put_bpf_raw_tp_regs();
+ return ret;
}
static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
@@ -910,6 +1059,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
const struct bpf_prog_ops raw_tracepoint_prog_ops = {
};
+static bool raw_tp_writable_prog_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off == 0) {
+ if (size != sizeof(u64) || type != BPF_READ)
+ return false;
+ info->reg_type = PTR_TO_TP_BUFFER;
+ }
+ return raw_tp_prog_is_valid_access(off, size, type, prog, info);
+}
+
+const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = {
+ .get_func_proto = raw_tp_prog_func_proto,
+ .is_valid_access = raw_tp_writable_prog_is_valid_access,
+};
+
+const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = {
+};
+
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -1000,7 +1170,7 @@ static DEFINE_MUTEX(bpf_event_mutex);
int perf_event_attach_bpf_prog(struct perf_event *event,
struct bpf_prog *prog)
{
- struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
int ret = -EEXIST;
@@ -1018,7 +1188,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
if (event->prog)
goto unlock;
- old_array = event->tp_event->prog_array;
+ old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
if (old_array &&
bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
ret = -E2BIG;
@@ -1041,7 +1211,7 @@ unlock:
void perf_event_detach_bpf_prog(struct perf_event *event)
{
- struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
int ret;
@@ -1050,7 +1220,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
if (!event->prog)
goto unlock;
- old_array = event->tp_event->prog_array;
+ old_array = bpf_event_rcu_dereference(event->tp_event->prog_array);
ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
if (ret == -ENOENT)
goto unlock;
@@ -1072,6 +1242,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
struct perf_event_query_bpf __user *uquery = info;
struct perf_event_query_bpf query = {};
+ struct bpf_prog_array *progs;
u32 *ids, prog_cnt, ids_len;
int ret;
@@ -1096,10 +1267,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
*/
mutex_lock(&bpf_event_mutex);
- ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
- ids,
- ids_len,
- &prog_cnt);
+ progs = bpf_event_rcu_dereference(event->tp_event->prog_array);
+ ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt);
mutex_unlock(&bpf_event_mutex);
if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
@@ -1199,6 +1368,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64))
return -EINVAL;
+ if (prog->aux->max_tp_access > btp->writable_size)
+ return -EINVAL;
+
return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
}
@@ -1259,8 +1431,23 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
return err;
}
+static int __init send_signal_irq_work_init(void)
+{
+ int cpu;
+ struct send_signal_irq_work *work;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&send_signal_work, cpu);
+ init_irq_work(&work->irq_work, do_bpf_send_signal);
+ }
+ return 0;
+}
+
+subsys_initcall(send_signal_irq_work_init);
+
#ifdef CONFIG_MODULES
-int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module)
+static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
+ void *module)
{
struct bpf_trace_module *btm, *tmp;
struct module *mod = module;
@@ -1299,7 +1486,7 @@ static struct notifier_block bpf_module_nb = {
.notifier_call = bpf_event_notify,
};
-int __init bpf_event_init(void)
+static int __init bpf_event_init(void)
{
register_module_notifier(&bpf_module_nb);
return 0;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index aac7847c0214..576c41644e77 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,6 +33,7 @@
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
#include <trace/events/sched.h>
@@ -69,12 +70,8 @@
#define INIT_OPS_HASH(opsname) \
.func_hash = &opsname.local_hash, \
.local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
-#define ASSIGN_OPS_HASH(opsname, val) \
- .func_hash = val, \
- .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
#else
#define INIT_OPS_HASH(opsname)
-#define ASSIGN_OPS_HASH(opsname, val)
#endif
enum {
@@ -1992,7 +1989,7 @@ static void print_bug_type(void)
* modifying the code. @failed should be one of either:
* EFAULT - if the problem happens on reading the @ip address
* EINVAL - if what is read at @ip is not what was expected
- * EPERM - if the problem happens on writting to the @ip address
+ * EPERM - if the problem happens on writing to the @ip address
*/
void ftrace_bug(int failed, struct dyn_ftrace *rec)
{
@@ -2391,7 +2388,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
- return -1; /* unknow ftrace bug */
+ return -1; /* unknown ftrace bug */
}
void __weak ftrace_replace_code(int mod_flags)
@@ -2938,14 +2935,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
p = &pg->records[i];
p->flags = rec_flags;
-#ifndef CC_USING_NOP_MCOUNT
/*
* Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
- if (!ftrace_code_disable(mod, p))
+ if (!__is_defined(CC_USING_NOP_MCOUNT) &&
+ !ftrace_code_disable(mod, p))
break;
-#endif
update_cnt++;
}
@@ -3004,7 +3000,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
int cnt;
if (!num_to_init)
- return 0;
+ return NULL;
start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
if (!pg)
@@ -3702,6 +3698,31 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
}
static int
+add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g,
+ int clear_filter)
+{
+ long index = simple_strtoul(func_g->search, NULL, 0);
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+
+ /* The index starts at 1 */
+ if (--index < 0)
+ return 0;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (pg->index <= index) {
+ index -= pg->index;
+ /* this is a double loop, break goes to the next page */
+ break;
+ }
+ rec = &pg->records[index];
+ enter_record(hash, rec, clear_filter);
+ return 1;
+ } while_for_each_ftrace_rec();
+ return 0;
+}
+
+static int
ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
struct ftrace_glob *mod_g, int exclude_mod)
{
@@ -3769,6 +3790,11 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
if (unlikely(ftrace_disabled))
goto out_unlock;
+ if (func_g.type == MATCH_INDEX) {
+ found = add_rec_by_index(hash, &func_g, clear_filter);
+ goto out_unlock;
+ }
+
do_for_each_ftrace_rec(pg, rec) {
if (rec->flags & FTRACE_FL_DISABLED)
@@ -3849,7 +3875,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
static bool module_exists(const char *module)
{
/* All modules have the symbol __this_module */
- const char this_mod[] = "__this_module";
+ static const char this_mod[] = "__this_module";
char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
unsigned long val;
int n;
@@ -4194,10 +4220,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
struct ftrace_func_entry *entry;
struct ftrace_func_map *map;
struct hlist_head *hhd;
- int size = 1 << mapper->hash.size_bits;
- int i;
+ int size, i;
+
+ if (!mapper)
+ return;
if (free_func && mapper->hash.count) {
+ size = 1 << mapper->hash.size_bits;
for (i = 0; i < size; i++) {
hhd = &mapper->hash.buckets[i];
hlist_for_each_entry(entry, hhd, hlist) {
@@ -4725,7 +4754,7 @@ static int
ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
int reset, int enable)
{
- return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
}
/**
@@ -5433,7 +5462,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops,
/*
* The name "destroy_filter_files" is really a misnomer. Although
- * in the future, it may actualy delete the files, but this is
+ * in the future, it may actually delete the files, but this is
* really intended to make sure the ops passed in are disabled
* and that when this function returns, the caller is free to
* free the ops.
@@ -5756,7 +5785,7 @@ void ftrace_module_enable(struct module *mod)
/*
* If the tracing is enabled, go ahead and enable the record.
*
- * The reason not to enable the record immediatelly is the
+ * The reason not to enable the record immediately is the
* inherent check of ftrace_make_nop/ftrace_make_call for
* correct previous instructions. Making first the NOP
* conversion puts the module to the correct state, thus
@@ -6216,7 +6245,7 @@ void ftrace_reset_array_ops(struct trace_array *tr)
tr->ops->func = ftrace_stub;
}
-static inline void
+static nokprobe_inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
@@ -6234,6 +6263,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /* Stub functions don't need to be called nor tested */
+ if (op->flags & FTRACE_OPS_FL_STUB)
+ continue;
/*
* Check the following for each ops before calling their func:
* if RCU flag is set, then rcu_is_watching() must be true
@@ -6276,11 +6308,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
{
__ftrace_ops_list_func(ip, parent_ip, NULL, regs);
}
+NOKPROBE_SYMBOL(ftrace_ops_list_func);
#else
static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
}
+NOKPROBE_SYMBOL(ftrace_ops_no_ops);
#endif
/*
@@ -6307,6 +6341,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
trace_clear_recursion(bit);
}
+NOKPROBE_SYMBOL(ftrace_ops_assist_func);
/**
* ftrace_ops_get_func - get the function a trampoline should call
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 06e864a334bb..05b0b3139ebc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -353,20 +353,6 @@ static void rb_init_page(struct buffer_data_page *bpage)
local_set(&bpage->commit, 0);
}
-/**
- * ring_buffer_page_len - the size of data on the page.
- * @page: The page to read
- *
- * Returns the amount of data on the page, including buffer page header.
- */
-size_t ring_buffer_page_len(void *page)
-{
- struct buffer_data_page *bpage = page;
-
- return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
- + BUF_PAGE_HDR_SIZE;
-}
-
/*
* Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
* this issue out.
@@ -776,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
preempt_disable_notrace();
time = rb_time_stamp(buffer);
- preempt_enable_no_resched_notrace();
+ preempt_enable_notrace();
return time;
}
@@ -4205,6 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
+ * @flags: gfp flags to use for memory allocation
*
* This performs the initial preparations necessary to iterate
* through the buffer. Memory is allocated, buffer recording
@@ -4222,7 +4209,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* This overall must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -4230,7 +4217,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
- iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ iter = kmalloc(sizeof(*iter), flags);
if (!iter)
return NULL;
@@ -4992,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
cnt = data->cnt + (nested ? 27 : 0);
/* Multiply cnt by ~e, to make some unique increment */
- size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1);
+ size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
len = size + sizeof(struct rb_item);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index ffba6789c0e2..0564f6db0561 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -362,7 +362,7 @@ static void ring_buffer_producer(void)
hit--; /* make it non zero */
}
- /* Caculate the average time in nanosecs */
+ /* Calculate the average time in nanosecs */
avg = NSEC_PER_MSEC / (hit + missed);
trace_printk("%ld ns per entry\n", avg);
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c521b7347482..c90c687cf950 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
static int tracing_set_tracer(struct trace_array *tr, const char *buf);
+static void ftrace_trace_userstack(struct ring_buffer *buffer,
+ unsigned long flags, int pc);
#define MAX_TRACER_SIZE 100
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
@@ -496,8 +498,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
* not modified.
*/
pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
- if (!pid_list)
+ if (!pid_list) {
+ trace_parser_put(&parser);
return -ENOMEM;
+ }
pid_list->pid_max = READ_ONCE(pid_max);
@@ -507,6 +511,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
if (!pid_list->pids) {
+ trace_parser_put(&parser);
kfree(pid_list);
return -ENOMEM;
}
@@ -894,7 +899,7 @@ int __trace_bputs(unsigned long ip, const char *str)
EXPORT_SYMBOL_GPL(__trace_bputs);
#ifdef CONFIG_TRACER_SNAPSHOT
-void tracing_snapshot_instance(struct trace_array *tr)
+void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data)
{
struct tracer *tracer = tr->current_trace;
unsigned long flags;
@@ -920,10 +925,15 @@ void tracing_snapshot_instance(struct trace_array *tr)
}
local_irq_save(flags);
- update_max_tr(tr, current, smp_processor_id());
+ update_max_tr(tr, current, smp_processor_id(), cond_data);
local_irq_restore(flags);
}
+void tracing_snapshot_instance(struct trace_array *tr)
+{
+ tracing_snapshot_instance_cond(tr, NULL);
+}
+
/**
* tracing_snapshot - take a snapshot of the current buffer.
*
@@ -946,6 +956,54 @@ void tracing_snapshot(void)
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
+/**
+ * tracing_snapshot_cond - conditionally take a snapshot of the current buffer.
+ * @tr: The tracing instance to snapshot
+ * @cond_data: The data to be tested conditionally, and possibly saved
+ *
+ * This is the same as tracing_snapshot() except that the snapshot is
+ * conditional - the snapshot will only happen if the
+ * cond_snapshot.update() implementation receiving the cond_data
+ * returns true, which means that the trace array's cond_snapshot
+ * update() operation used the cond_data to determine whether the
+ * snapshot should be taken, and if it was, presumably saved it along
+ * with the snapshot.
+ */
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ tracing_snapshot_instance_cond(tr, cond_data);
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
+
+/**
+ * tracing_snapshot_cond_data - get the user data associated with a snapshot
+ * @tr: The tracing instance
+ *
+ * When the user enables a conditional snapshot using
+ * tracing_snapshot_cond_enable(), the user-defined cond_data is saved
+ * with the snapshot. This accessor is used to retrieve it.
+ *
+ * Should not be called from cond_snapshot.update(), since it takes
+ * the tr->max_lock lock, which the code calling
+ * cond_snapshot.update() has already done.
+ *
+ * Returns the cond_data associated with the trace array's snapshot.
+ */
+void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+ void *cond_data = NULL;
+
+ arch_spin_lock(&tr->max_lock);
+
+ if (tr->cond_snapshot)
+ cond_data = tr->cond_snapshot->cond_data;
+
+ arch_spin_unlock(&tr->max_lock);
+
+ return cond_data;
+}
+EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
+
static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
struct trace_buffer *size_buf, int cpu_id);
static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
@@ -1025,12 +1083,111 @@ void tracing_snapshot_alloc(void)
tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+
+/**
+ * tracing_snapshot_cond_enable - enable conditional snapshot for an instance
+ * @tr: The tracing instance
+ * @cond_data: User data to associate with the snapshot
+ * @update: Implementation of the cond_snapshot update function
+ *
+ * Check whether the conditional snapshot for the given instance has
+ * already been enabled, or if the current tracer is already using a
+ * snapshot; if so, return -EBUSY, else create a cond_snapshot and
+ * save the cond_data and update function inside.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
+ cond_update_fn_t update)
+{
+ struct cond_snapshot *cond_snapshot;
+ int ret = 0;
+
+ cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
+ if (!cond_snapshot)
+ return -ENOMEM;
+
+ cond_snapshot->cond_data = cond_data;
+ cond_snapshot->update = update;
+
+ mutex_lock(&trace_types_lock);
+
+ ret = tracing_alloc_snapshot_instance(tr);
+ if (ret)
+ goto fail_unlock;
+
+ if (tr->current_trace->use_max_tr) {
+ ret = -EBUSY;
+ goto fail_unlock;
+ }
+
+ /*
+ * The cond_snapshot can only change to NULL without the
+ * trace_types_lock. We don't care if we race with it going
+ * to NULL, but we want to make sure that it's not set to
+ * something other than NULL when we get here, which we can
+ * do safely with only holding the trace_types_lock and not
+ * having to take the max_lock.
+ */
+ if (tr->cond_snapshot) {
+ ret = -EBUSY;
+ goto fail_unlock;
+ }
+
+ arch_spin_lock(&tr->max_lock);
+ tr->cond_snapshot = cond_snapshot;
+ arch_spin_unlock(&tr->max_lock);
+
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+
+ fail_unlock:
+ mutex_unlock(&trace_types_lock);
+ kfree(cond_snapshot);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
+
+/**
+ * tracing_snapshot_cond_disable - disable conditional snapshot for an instance
+ * @tr: The tracing instance
+ *
+ * Check whether the conditional snapshot for the given instance is
+ * enabled; if so, free the cond_snapshot associated with it,
+ * otherwise return -EINVAL.
+ *
+ * Returns 0 if successful, error otherwise.
+ */
+int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+ int ret = 0;
+
+ arch_spin_lock(&tr->max_lock);
+
+ if (!tr->cond_snapshot)
+ ret = -EINVAL;
+ else {
+ kfree(tr->cond_snapshot);
+ tr->cond_snapshot = NULL;
+ }
+
+ arch_spin_unlock(&tr->max_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#else
void tracing_snapshot(void)
{
WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
}
EXPORT_SYMBOL_GPL(tracing_snapshot);
+void tracing_snapshot_cond(struct trace_array *tr, void *cond_data)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used");
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond);
int tracing_alloc_snapshot(void)
{
WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used");
@@ -1043,6 +1200,21 @@ void tracing_snapshot_alloc(void)
tracing_snapshot();
}
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+void *tracing_cond_snapshot_data(struct trace_array *tr)
+{
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data);
+int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update)
+{
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
+int tracing_snapshot_cond_disable(struct trace_array *tr)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable);
#endif /* CONFIG_TRACER_SNAPSHOT */
void tracer_tracing_off(struct trace_array *tr)
@@ -1330,7 +1502,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
- memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
+ strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
max_data->pid = tsk->pid;
/*
* If tsk == current, then use current_uid(), as that does not use
@@ -1354,12 +1526,14 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
* @tr: tracer
* @tsk: the task with the latency
* @cpu: The cpu that initiated the trace.
+ * @cond_data: User data associated with a conditional snapshot
*
* Flip the buffers between the @tr and the max_tr and record information
* about which task was the cause of this latency.
*/
void
-update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
+update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
+ void *cond_data)
{
if (tr->stop_count)
return;
@@ -1380,9 +1554,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
else
ring_buffer_record_off(tr->max_buffer.buffer);
+#ifdef CONFIG_TRACER_SNAPSHOT
+ if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data))
+ goto out_unlock;
+#endif
swap(tr->trace_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
+
+ out_unlock:
arch_spin_unlock(&tr->max_lock);
}
@@ -1547,6 +1727,10 @@ static __init int init_trace_selftests(void)
pr_info("Running postponed tracer tests:\n");
list_for_each_entry_safe(p, n, &postponed_selftests, list) {
+ /* This loop can take minutes when sanitizers are enabled, so
+ * lets make sure we allow RCU processing.
+ */
+ cond_resched();
ret = run_tracer_selftest(p->type);
/* If the test fails, then warn and remove from available_tracers */
if (ret < 0) {
@@ -1748,7 +1932,7 @@ static inline char *get_saved_cmdlines(int idx)
static inline void set_cmdline(int idx, const char *cmdline)
{
- memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
+ strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
static int allocate_cmdlines_buffer(unsigned int val,
@@ -2574,12 +2758,21 @@ trace_function(struct trace_array *tr,
#ifdef CONFIG_STACKTRACE
-#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
+/* Allow 4 levels of nesting: normal, softirq, irq, NMI */
+#define FTRACE_KSTACK_NESTING 4
+
+#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING)
+
struct ftrace_stack {
- unsigned long calls[FTRACE_STACK_MAX_ENTRIES];
+ unsigned long calls[FTRACE_KSTACK_ENTRIES];
+};
+
+
+struct ftrace_stacks {
+ struct ftrace_stack stacks[FTRACE_KSTACK_NESTING];
};
-static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack);
+static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);
static DEFINE_PER_CPU(int, ftrace_stack_reserve);
static void __ftrace_trace_stack(struct ring_buffer *buffer,
@@ -2588,13 +2781,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
{
struct trace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
+ unsigned int size, nr_entries;
+ struct ftrace_stack *fstack;
struct stack_entry *entry;
- struct stack_trace trace;
- int use_stack;
- int size = FTRACE_STACK_ENTRIES;
-
- trace.nr_entries = 0;
- trace.skip = skip;
+ int stackidx;
/*
* Add one, for this function and the call to save_stack_trace()
@@ -2602,7 +2792,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
#ifndef CONFIG_UNWINDER_ORC
if (!regs)
- trace.skip++;
+ skip++;
#endif
/*
@@ -2613,53 +2803,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
*/
preempt_disable_notrace();
- use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
+ stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
+
+ /* This should never happen. If it does, yell once and skip */
+ if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
+ goto out;
+
/*
- * We don't need any atomic variables, just a barrier.
- * If an interrupt comes in, we don't care, because it would
- * have exited and put the counter back to what we want.
- * We just need a barrier to keep gcc from moving things
- * around.
+ * The above __this_cpu_inc_return() is 'atomic' cpu local. An
+ * interrupt will either see the value pre increment or post
+ * increment. If the interrupt happens pre increment it will have
+ * restored the counter when it returns. We just need a barrier to
+ * keep gcc from moving things around.
*/
barrier();
- if (use_stack == 1) {
- trace.entries = this_cpu_ptr(ftrace_stack.calls);
- trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
- if (regs)
- save_stack_trace_regs(regs, &trace);
- else
- save_stack_trace(&trace);
-
- if (trace.nr_entries > size)
- size = trace.nr_entries;
- } else
- /* From now on, use_stack is a boolean */
- use_stack = 0;
+ fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx;
+ size = ARRAY_SIZE(fstack->calls);
- size *= sizeof(unsigned long);
+ if (regs) {
+ nr_entries = stack_trace_save_regs(regs, fstack->calls,
+ size, skip);
+ } else {
+ nr_entries = stack_trace_save(fstack->calls, size, skip);
+ }
+ size = nr_entries * sizeof(unsigned long);
event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
sizeof(*entry) + size, flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
- memset(&entry->caller, 0, size);
-
- if (use_stack)
- memcpy(&entry->caller, trace.entries,
- trace.nr_entries * sizeof(unsigned long));
- else {
- trace.max_entries = FTRACE_STACK_ENTRIES;
- trace.entries = entry->caller;
- if (regs)
- save_stack_trace_regs(regs, &trace);
- else
- save_stack_trace(&trace);
- }
-
- entry->size = trace.nr_entries;
+ memcpy(&entry->caller, fstack->calls, size);
+ entry->size = nr_entries;
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
@@ -2729,15 +2906,15 @@ void trace_dump_stack(int skip)
}
EXPORT_SYMBOL_GPL(trace_dump_stack);
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
static DEFINE_PER_CPU(int, user_stack_count);
-void
+static void
ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
- struct stack_trace trace;
if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
@@ -2768,12 +2945,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
entry->tgid = current->tgid;
memset(&entry->caller, 0, sizeof(entry->caller));
- trace.nr_entries = 0;
- trace.max_entries = FTRACE_STACK_ENTRIES;
- trace.skip = 0;
- trace.entries = entry->caller;
-
- save_stack_trace_user(&trace);
+ stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
@@ -2782,13 +2954,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
out:
preempt_enable();
}
-
-#ifdef UNUSED
-static void __trace_userstack(struct trace_array *tr, unsigned long flags)
+#else /* CONFIG_USER_STACKTRACE_SUPPORT */
+static void ftrace_trace_userstack(struct ring_buffer *buffer,
+ unsigned long flags, int pc)
{
- ftrace_trace_userstack(tr, flags, preempt_count());
}
-#endif /* UNUSED */
+#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */
#endif /* CONFIG_STACKTRACE */
@@ -2878,6 +3049,7 @@ void trace_printk_init_buffers(void)
if (global_trace.trace_buffer.buffer)
tracing_start_cmdline_record();
}
+EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
void trace_printk_start_comm(void)
{
@@ -3038,6 +3210,7 @@ int trace_array_printk(struct trace_array *tr,
va_end(ap);
return ret;
}
+EXPORT_SYMBOL_GPL(trace_array_printk);
__printf(3, 4)
int trace_array_printk_buf(struct ring_buffer *buffer,
@@ -3316,33 +3489,68 @@ static void s_stop(struct seq_file *m, void *p)
}
static void
+get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total,
+ unsigned long *entries, int cpu)
+{
+ unsigned long count;
+
+ count = ring_buffer_entries_cpu(buf->buffer, cpu);
+ /*
+ * If this buffer has skipped entries, then we hold all
+ * entries for the trace and we need to ignore the
+ * ones before the time stamp.
+ */
+ if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
+ count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
+ /* total is the same as the entries */
+ *total = count;
+ } else
+ *total = count +
+ ring_buffer_overrun_cpu(buf->buffer, cpu);
+ *entries = count;
+}
+
+static void
get_total_entries(struct trace_buffer *buf,
unsigned long *total, unsigned long *entries)
{
- unsigned long count;
+ unsigned long t, e;
int cpu;
*total = 0;
*entries = 0;
for_each_tracing_cpu(cpu) {
- count = ring_buffer_entries_cpu(buf->buffer, cpu);
- /*
- * If this buffer has skipped entries, then we hold all
- * entries for the trace and we need to ignore the
- * ones before the time stamp.
- */
- if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
- count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
- /* total is the same as the entries */
- *total += count;
- } else
- *total += count +
- ring_buffer_overrun_cpu(buf->buffer, cpu);
- *entries += count;
+ get_total_entries_cpu(buf, &t, &e, cpu);
+ *total += t;
+ *entries += e;
}
}
+unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu)
+{
+ unsigned long total, entries;
+
+ if (!tr)
+ tr = &global_trace;
+
+ get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu);
+
+ return entries;
+}
+
+unsigned long trace_total_entries(struct trace_array *tr)
+{
+ unsigned long total, entries;
+
+ if (!tr)
+ tr = &global_trace;
+
+ get_total_entries(&tr->trace_buffer, &total, &entries);
+
+ return entries;
+}
+
static void print_lat_help_header(struct seq_file *m)
{
seq_puts(m, "# _------=> CPU# \n"
@@ -3381,23 +3589,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
- const char tgid_space[] = " ";
- const char space[] = " ";
-
- seq_printf(m, "# %s _-----=> irqs-off\n",
- tgid ? tgid_space : space);
- seq_printf(m, "# %s / _----=> need-resched\n",
- tgid ? tgid_space : space);
- seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
- tgid ? tgid_space : space);
- seq_printf(m, "# %s|| / _--=> preempt-depth\n",
- tgid ? tgid_space : space);
- seq_printf(m, "# %s||| / delay\n",
- tgid ? tgid_space : space);
- seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
- tgid ? " TGID " : space);
- seq_printf(m, "# | | %s | |||| | |\n",
- tgid ? " | " : space);
+ const char *space = " ";
+ int prec = tgid ? 10 : 2;
+
+ print_event_info(buf, m);
+
+ seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
+ seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
+ seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
+ seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
+ seq_printf(m, "# %.*s||| / delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
}
void
@@ -3902,7 +4105,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ cpu, GFP_KERNEL);
}
ring_buffer_read_prepare_sync();
for_each_tracing_cpu(cpu) {
@@ -3912,7 +4116,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ cpu, GFP_KERNEL);
ring_buffer_read_prepare_sync();
ring_buffer_read_start(iter->buffer_iter[cpu]);
tracing_iter_reset(iter, cpu);
@@ -4521,6 +4726,7 @@ static const char readme_msg[] =
" trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
" current_tracer\t- function and latency tracers\n"
" available_tracers\t- list of configured tracers for current_tracer\n"
+ " error_log\t- error log for failed commands (that support it)\n"
" buffer_size_kb\t- view and modify size of per cpu buffer\n"
" buffer_total_size_kb - view total size of all cpu buffers\n\n"
" trace_clock\t\t-change the clock used to order events\n"
@@ -4541,7 +4747,7 @@ static const char readme_msg[] =
" instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
"\t\t\t Remove sub-buffer with rmdir\n"
" trace_options\t\t- Set format or modify how tracing happens\n"
- "\t\t\t Disable an option by adding a suffix 'no' to the\n"
+ "\t\t\t Disable an option by prefixing 'no' to the\n"
"\t\t\t option name\n"
" saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -4700,6 +4906,7 @@ static const char readme_msg[] =
"\t [:size=#entries]\n"
"\t [:pause][:continue][:clear]\n"
"\t [:name=histname1]\n"
+ "\t [:<handler>.<action>]\n"
"\t [if <filter>]\n\n"
"\t When a matching event is hit, an entry is added to a hash\n"
"\t table using the key(s) and value(s) named, and the value of a\n"
@@ -4740,8 +4947,21 @@ static const char readme_msg[] =
"\t unchanged.\n\n"
"\t The enable_hist and disable_hist triggers can be used to\n"
"\t have one event conditionally start and stop another event's\n"
- "\t already-attached hist trigger. The syntax is analagous to\n"
- "\t the enable_event and disable_event triggers.\n"
+ "\t already-attached hist trigger. The syntax is analogous to\n"
+ "\t the enable_event and disable_event triggers.\n\n"
+ "\t Hist trigger handlers and actions are executed whenever a\n"
+ "\t a histogram entry is added or updated. They take the form:\n\n"
+ "\t <handler>.<action>\n\n"
+ "\t The available handlers are:\n\n"
+ "\t onmatch(matching.event) - invoke on addition or update\n"
+ "\t onmax(var) - invoke if var exceeds current max\n"
+ "\t onchange(var) - invoke action if var changes\n\n"
+ "\t The available actions are:\n\n"
+ "\t trace(<synthetic_event>,param list) - generate synthetic event\n"
+ "\t save(field,...) - save current event fields\n"
+#ifdef CONFIG_TRACER_SNAPSHOT
+ "\t snapshot() - snapshot the trace buffer\n"
+#endif
#endif
;
@@ -5386,6 +5606,16 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t == tr->current_trace)
goto out;
+#ifdef CONFIG_TRACER_SNAPSHOT
+ if (t->use_max_tr) {
+ arch_spin_lock(&tr->max_lock);
+ if (tr->cond_snapshot)
+ ret = -EBUSY;
+ arch_spin_unlock(&tr->max_lock);
+ if (ret)
+ goto out;
+ }
+#endif
/* Some tracers won't work on kernel command line */
if (system_state < SYSTEM_RUNNING && t->noboot) {
pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
@@ -5624,7 +5854,6 @@ out:
return ret;
fail:
- kfree(iter->trace);
kfree(iter);
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
@@ -5823,7 +6052,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
}
static const struct pipe_buf_operations tracing_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -6103,13 +6331,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
struct ring_buffer *buffer;
struct print_entry *entry;
unsigned long irq_flags;
- const char faulted[] = "<faulted>";
ssize_t written;
int size;
int len;
/* Used in tracing_mark_raw_write() as well */
-#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */
+#define FAULTED_STR "<faulted>"
+#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */
if (tracing_disabled)
return -EINVAL;
@@ -6141,7 +6369,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt);
if (len) {
- memcpy(&entry->buf, faulted, FAULTED_SIZE);
+ memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
cnt = FAULTED_SIZE;
written = -EFAULT;
} else
@@ -6182,7 +6410,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct raw_data_entry *entry;
- const char faulted[] = "<faulted>";
unsigned long irq_flags;
ssize_t written;
int size;
@@ -6222,7 +6449,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
len = __copy_from_user_inatomic(&entry->id, ubuf, cnt);
if (len) {
entry->id = -1;
- memcpy(&entry->buf, faulted, FAULTED_SIZE);
+ memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE);
written = -EFAULT;
} else
written = cnt;
@@ -6468,6 +6695,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
goto out;
}
+ arch_spin_lock(&tr->max_lock);
+ if (tr->cond_snapshot)
+ ret = -EBUSY;
+ arch_spin_unlock(&tr->max_lock);
+ if (ret)
+ goto out;
+
switch (val) {
case 0:
if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
@@ -6485,15 +6719,17 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
break;
}
#endif
- if (!tr->allocated_snapshot) {
+ if (tr->allocated_snapshot)
+ ret = resize_buffer_duplicate_size(&tr->max_buffer,
+ &tr->trace_buffer, iter->cpu_file);
+ else
ret = tracing_alloc_snapshot_instance(tr);
- if (ret < 0)
- break;
- }
+ if (ret < 0)
+ break;
local_irq_disable();
/* Now, we're going to swap */
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
- update_max_tr(tr, current, smp_processor_id());
+ update_max_tr(tr, current, smp_processor_id(), NULL);
else
update_max_tr_single(tr, current, iter->cpu_file);
local_irq_enable();
@@ -6668,6 +6904,250 @@ static const struct file_operations snapshot_raw_fops = {
#endif /* CONFIG_TRACER_SNAPSHOT */
+#define TRACING_LOG_ERRS_MAX 8
+#define TRACING_LOG_LOC_MAX 128
+
+#define CMD_PREFIX " Command: "
+
+struct err_info {
+ const char **errs; /* ptr to loc-specific array of err strings */
+ u8 type; /* index into errs -> specific err string */
+ u8 pos; /* MAX_FILTER_STR_VAL = 256 */
+ u64 ts;
+};
+
+struct tracing_log_err {
+ struct list_head list;
+ struct err_info info;
+ char loc[TRACING_LOG_LOC_MAX]; /* err location */
+ char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
+};
+
+static DEFINE_MUTEX(tracing_err_log_lock);
+
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+{
+ struct tracing_log_err *err;
+
+ if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
+ err = kzalloc(sizeof(*err), GFP_KERNEL);
+ if (!err)
+ err = ERR_PTR(-ENOMEM);
+ tr->n_err_log_entries++;
+
+ return err;
+ }
+
+ err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
+ list_del(&err->list);
+
+ return err;
+}
+
+/**
+ * err_pos - find the position of a string within a command for error careting
+ * @cmd: The tracing command that caused the error
+ * @str: The string to position the caret at within @cmd
+ *
+ * Finds the position of the first occurence of @str within @cmd. The
+ * return value can be passed to tracing_log_err() for caret placement
+ * within @cmd.
+ *
+ * Returns the index within @cmd of the first occurence of @str or 0
+ * if @str was not found.
+ */
+unsigned int err_pos(char *cmd, const char *str)
+{
+ char *found;
+
+ if (WARN_ON(!strlen(cmd)))
+ return 0;
+
+ found = strstr(cmd, str);
+ if (found)
+ return found - cmd;
+
+ return 0;
+}
+
+/**
+ * tracing_log_err - write an error to the tracing error log
+ * @tr: The associated trace array for the error (NULL for top level array)
+ * @loc: A string describing where the error occurred
+ * @cmd: The tracing command that caused the error
+ * @errs: The array of loc-specific static error strings
+ * @type: The index into errs[], which produces the specific static err string
+ * @pos: The position the caret should be placed in the cmd
+ *
+ * Writes an error into tracing/error_log of the form:
+ *
+ * <loc>: error: <text>
+ * Command: <cmd>
+ * ^
+ *
+ * tracing/error_log is a small log file containing the last
+ * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated
+ * unless there has been a tracing error, and the error log can be
+ * cleared and have its memory freed by writing the empty string in
+ * truncation mode to it i.e. echo > tracing/error_log.
+ *
+ * NOTE: the @errs array along with the @type param are used to
+ * produce a static error string - this string is not copied and saved
+ * when the error is logged - only a pointer to it is saved. See
+ * existing callers for examples of how static strings are typically
+ * defined for use with tracing_log_err().
+ */
+void tracing_log_err(struct trace_array *tr,
+ const char *loc, const char *cmd,
+ const char **errs, u8 type, u8 pos)
+{
+ struct tracing_log_err *err;
+
+ if (!tr)
+ tr = &global_trace;
+
+ mutex_lock(&tracing_err_log_lock);
+ err = get_tracing_log_err(tr);
+ if (PTR_ERR(err) == -ENOMEM) {
+ mutex_unlock(&tracing_err_log_lock);
+ return;
+ }
+
+ snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
+ snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd);
+
+ err->info.errs = errs;
+ err->info.type = type;
+ err->info.pos = pos;
+ err->info.ts = local_clock();
+
+ list_add_tail(&err->list, &tr->err_log);
+ mutex_unlock(&tracing_err_log_lock);
+}
+
+static void clear_tracing_err_log(struct trace_array *tr)
+{
+ struct tracing_log_err *err, *next;
+
+ mutex_lock(&tracing_err_log_lock);
+ list_for_each_entry_safe(err, next, &tr->err_log, list) {
+ list_del(&err->list);
+ kfree(err);
+ }
+
+ tr->n_err_log_entries = 0;
+ mutex_unlock(&tracing_err_log_lock);
+}
+
+static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct trace_array *tr = m->private;
+
+ mutex_lock(&tracing_err_log_lock);
+
+ return seq_list_start(&tr->err_log, *pos);
+}
+
+static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_array *tr = m->private;
+
+ return seq_list_next(v, &tr->err_log, pos);
+}
+
+static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
+{
+ mutex_unlock(&tracing_err_log_lock);
+}
+
+static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
+{
+ u8 i;
+
+ for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
+ seq_putc(m, ' ');
+ for (i = 0; i < pos; i++)
+ seq_putc(m, ' ');
+ seq_puts(m, "^\n");
+}
+
+static int tracing_err_log_seq_show(struct seq_file *m, void *v)
+{
+ struct tracing_log_err *err = v;
+
+ if (err) {
+ const char *err_text = err->info.errs[err->info.type];
+ u64 sec = err->info.ts;
+ u32 nsec;
+
+ nsec = do_div(sec, NSEC_PER_SEC);
+ seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000,
+ err->loc, err_text);
+ seq_printf(m, "%s", err->cmd);
+ tracing_err_log_show_pos(m, err->info.pos);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations tracing_err_log_seq_ops = {
+ .start = tracing_err_log_seq_start,
+ .next = tracing_err_log_seq_next,
+ .stop = tracing_err_log_seq_stop,
+ .show = tracing_err_log_seq_show
+};
+
+static int tracing_err_log_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ int ret = 0;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ /* If this file was opened for write, then erase contents */
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
+ clear_tracing_err_log(tr);
+
+ if (file->f_mode & FMODE_READ) {
+ ret = seq_open(file, &tracing_err_log_seq_ops);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = tr;
+ } else {
+ trace_array_put(tr);
+ }
+ }
+ return ret;
+}
+
+static ssize_t tracing_err_log_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return count;
+}
+
+static int tracing_err_log_release(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+
+ return 0;
+}
+
+static const struct file_operations tracing_err_log_fops = {
+ .open = tracing_err_log_open,
+ .write = tracing_err_log_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_err_log_release,
+};
+
static int tracing_buffers_open(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
@@ -6817,36 +7297,43 @@ struct buffer_ref {
struct ring_buffer *buffer;
void *page;
int cpu;
- int ref;
+ refcount_t refcount;
};
+static void buffer_ref_release(struct buffer_ref *ref)
+{
+ if (!refcount_dec_and_test(&ref->refcount))
+ return;
+ ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
+ kfree(ref);
+}
+
static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
- if (--ref->ref)
- return;
-
- ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
- kfree(ref);
+ buffer_ref_release(ref);
buf->private = 0;
}
-static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
- ref->ref++;
+ if (refcount_read(&ref->refcount) > INT_MAX/2)
+ return false;
+
+ refcount_inc(&ref->refcount);
+ return true;
}
/* Pipe buffer operations for a buffer. */
static const struct pipe_buf_operations buffer_pipe_buf_ops = {
- .can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
+ .steal = generic_pipe_buf_nosteal,
.get = buffer_pipe_buf_get,
};
@@ -6859,11 +7346,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
struct buffer_ref *ref =
(struct buffer_ref *)spd->partial[i].private;
- if (--ref->ref)
- return;
-
- ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
- kfree(ref);
+ buffer_ref_release(ref);
spd->partial[i].private = 0;
}
@@ -6918,7 +7401,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- ref->ref = 1;
+ refcount_set(&ref->refcount, 1);
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (IS_ERR(ref->page)) {
@@ -7723,7 +8206,7 @@ static const struct file_operations buffer_percent_fops = {
.llseek = default_llseek,
};
-struct dentry *trace_instance_dir;
+static struct dentry *trace_instance_dir;
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
@@ -7830,7 +8313,7 @@ static void update_tracer_options(struct trace_array *tr)
mutex_unlock(&trace_types_lock);
}
-static int instance_mkdir(const char *name)
+struct trace_array *trace_array_create(const char *name)
{
struct trace_array *tr;
int ret;
@@ -7869,6 +8352,7 @@ static int instance_mkdir(const char *name)
INIT_LIST_HEAD(&tr->systems);
INIT_LIST_HEAD(&tr->events);
INIT_LIST_HEAD(&tr->hist_vars);
+ INIT_LIST_HEAD(&tr->err_log);
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
@@ -7894,7 +8378,7 @@ static int instance_mkdir(const char *name)
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
- return 0;
+ return tr;
out_free_tr:
free_trace_buffers(tr);
@@ -7906,33 +8390,21 @@ static int instance_mkdir(const char *name)
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
- return ret;
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(trace_array_create);
+static int instance_mkdir(const char *name)
+{
+ return PTR_ERR_OR_ZERO(trace_array_create(name));
}
-static int instance_rmdir(const char *name)
+static int __remove_instance(struct trace_array *tr)
{
- struct trace_array *tr;
- int found = 0;
- int ret;
int i;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
-
- ret = -ENODEV;
- list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0) {
- found = 1;
- break;
- }
- }
- if (!found)
- goto out_unlock;
-
- ret = -EBUSY;
if (tr->ref || (tr->current_trace && tr->current_trace->ref))
- goto out_unlock;
+ return -EBUSY;
list_del(&tr->list);
@@ -7958,10 +8430,46 @@ static int instance_rmdir(const char *name)
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
+ tr = NULL;
- ret = 0;
+ return 0;
+}
+
+int trace_array_destroy(struct trace_array *tr)
+{
+ int ret;
+
+ if (!tr)
+ return -EINVAL;
+
+ mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
+
+ ret = __remove_instance(tr);
+
+ mutex_unlock(&trace_types_lock);
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_array_destroy);
+
+static int instance_rmdir(const char *name)
+{
+ struct trace_array *tr;
+ int ret;
+
+ mutex_lock(&event_mutex);
+ mutex_lock(&trace_types_lock);
+
+ ret = -ENODEV;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0) {
+ ret = __remove_instance(tr);
+ break;
+ }
+ }
- out_unlock:
mutex_unlock(&trace_types_lock);
mutex_unlock(&event_mutex);
@@ -8051,6 +8559,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
tr, &snapshot_fops);
#endif
+ trace_create_file("error_log", 0644, d_tracer,
+ tr, &tracing_err_log_fops);
+
for_each_tracing_cpu(cpu)
tracing_init_tracefs_percpu(tr, cpu);
@@ -8107,10 +8618,6 @@ struct dentry *tracing_init_dentry(void)
*/
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
- if (!tr->dir) {
- pr_warn_once("Could not create debugfs directory 'tracing'\n");
- return ERR_PTR(-ENOMEM);
- }
return NULL;
}
@@ -8413,12 +8920,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
cnt++;
- /* reset all but tr, trace, and overruns */
- memset(&iter.seq, 0,
- sizeof(struct trace_iterator) -
- offsetof(struct trace_iterator, seq));
+ trace_iterator_reset(&iter);
iter.iter_flags |= TRACE_FILE_LAT_FMT;
- iter.pos = -1;
if (trace_find_next_entry_inc(&iter) != NULL) {
int ret;
@@ -8636,6 +9139,7 @@ __init static int tracer_alloc_buffers(void)
INIT_LIST_HEAD(&global_trace.systems);
INIT_LIST_HEAD(&global_trace.events);
INIT_LIST_HEAD(&global_trace.hist_vars);
+ INIT_LIST_HEAD(&global_trace.err_log);
list_add(&global_trace.list, &ftrace_trace_arrays);
apply_trace_boot_options();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 08900828d282..005f08629b8b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -15,7 +15,6 @@
#include <linux/trace_seq.h>
#include <linux/trace_events.h>
#include <linux/compiler.h>
-#include <linux/trace_seq.h>
#include <linux/glob.h>
#ifdef CONFIG_FTRACE_SYSCALLS
@@ -194,6 +193,51 @@ struct trace_pid_list {
unsigned long *pids;
};
+typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data);
+
+/**
+ * struct cond_snapshot - conditional snapshot data and callback
+ *
+ * The cond_snapshot structure encapsulates a callback function and
+ * data associated with the snapshot for a given tracing instance.
+ *
+ * When a snapshot is taken conditionally, by invoking
+ * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is
+ * passed in turn to the cond_snapshot.update() function. That data
+ * can be compared by the update() implementation with the cond_data
+ * contained wihin the struct cond_snapshot instance associated with
+ * the trace_array. Because the tr->max_lock is held throughout the
+ * update() call, the update() function can directly retrieve the
+ * cond_snapshot and cond_data associated with the per-instance
+ * snapshot associated with the trace_array.
+ *
+ * The cond_snapshot.update() implementation can save data to be
+ * associated with the snapshot if it decides to, and returns 'true'
+ * in that case, or it returns 'false' if the conditional snapshot
+ * shouldn't be taken.
+ *
+ * The cond_snapshot instance is created and associated with the
+ * user-defined cond_data by tracing_cond_snapshot_enable().
+ * Likewise, the cond_snapshot instance is destroyed and is no longer
+ * associated with the trace instance by
+ * tracing_cond_snapshot_disable().
+ *
+ * The method below is required.
+ *
+ * @update: When a conditional snapshot is invoked, the update()
+ * callback function is invoked with the tr->max_lock held. The
+ * update() implementation signals whether or not to actually
+ * take the snapshot, by returning 'true' if so, 'false' if no
+ * snapshot should be taken. Because the max_lock is held for
+ * the duration of update(), the implementation is safe to
+ * directly retrieven and save any implementation data it needs
+ * to in association with the snapshot.
+ */
+struct cond_snapshot {
+ void *cond_data;
+ cond_update_fn_t update;
+};
+
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
@@ -248,11 +292,13 @@ struct trace_array {
int nr_topts;
bool clear_trace;
int buffer_percent;
+ unsigned int n_err_log_entries;
struct tracer *current_trace;
unsigned int trace_flags;
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
+ struct list_head err_log;
struct dentry *dir;
struct dentry *options;
struct dentry *percpu_dir;
@@ -277,6 +323,9 @@ struct trace_array {
#endif
int time_stamp_abs_ref;
struct list_head hist_vars;
+#ifdef CONFIG_TRACER_SNAPSHOT
+ struct cond_snapshot *cond_snapshot;
+#endif
};
enum {
@@ -671,6 +720,9 @@ void trace_init_global_iter(struct trace_iterator *iter);
void tracing_iter_reset(struct trace_iterator *iter, int cpu);
+unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu);
+unsigned long trace_total_entries(struct trace_array *tr);
+
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
@@ -727,23 +779,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
const char __user *ubuf, size_t cnt);
#ifdef CONFIG_TRACER_MAX_TRACE
-void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
+void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
+ void *cond_data);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
#endif /* CONFIG_TRACER_MAX_TRACE */
#ifdef CONFIG_STACKTRACE
-void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
- int pc);
-
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
#else
-static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
- unsigned long flags, int pc)
-{
-}
-
static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
int skip, int pc)
{
@@ -855,10 +900,11 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
-#define TRACE_GRAPH_PRINT_IRQS 0x40
-#define TRACE_GRAPH_PRINT_TAIL 0x80
-#define TRACE_GRAPH_SLEEP_TIME 0x100
-#define TRACE_GRAPH_GRAPH_TIME 0x200
+#define TRACE_GRAPH_PRINT_REL_TIME 0x40
+#define TRACE_GRAPH_PRINT_IRQS 0x80
+#define TRACE_GRAPH_PRINT_TAIL 0x100
+#define TRACE_GRAPH_SLEEP_TIME 0x200
+#define TRACE_GRAPH_GRAPH_TIME 0x400
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -1458,6 +1504,7 @@ enum regex_type {
MATCH_MIDDLE_ONLY,
MATCH_END_ONLY,
MATCH_GLOB,
+ MATCH_INDEX,
};
struct regex {
@@ -1502,7 +1549,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
extern int filter_assign_type(const char *type);
-extern int create_event_filter(struct trace_event_call *call,
+extern int create_event_filter(struct trace_array *tr,
+ struct trace_event_call *call,
char *filter_str, bool set_str,
struct event_filter **filterp);
extern void free_event_filter(struct event_filter *filter);
@@ -1808,6 +1856,11 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops)
extern int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable);
extern int tracing_alloc_snapshot(void);
+extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data);
+extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update);
+
+extern int tracing_snapshot_cond_disable(struct trace_array *tr);
+extern void *tracing_cond_snapshot_data(struct trace_array *tr);
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
@@ -1828,6 +1881,11 @@ extern ssize_t trace_parse_run_command(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos,
int (*createfn)(int, char**));
+extern unsigned int err_pos(char *cmd, const char *str);
+extern void tracing_log_err(struct trace_array *tr,
+ const char *loc, const char *cmd,
+ const char **errs, u8 type, u8 pos);
+
/*
* Normal trace_printk() and friends allocates special buffers
* to do the manipulation, as well as saves the print formats
@@ -1908,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
extern struct trace_iterator *tracepoint_print_iter;
+/*
+ * Reset the state of the trace_iterator so that it can read consumed data.
+ * Normally, the trace_iterator is used for reading the data when it is not
+ * consumed, and must retain state.
+ */
+static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
+{
+ const size_t offset = offsetof(struct trace_iterator, seq);
+
+ /*
+ * Keep gcc from complaining about overwriting more than just one
+ * member in the structure.
+ */
+ memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset);
+
+ iter->pos = -1;
+}
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4ad967453b6f..3ea65cdff30d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
void ftrace_likely_update(struct ftrace_likely_data *f, int val,
int expect, int is_constant)
{
+ unsigned long flags = user_access_save();
+
/* A constant is always correct */
if (is_constant) {
f->constant++;
@@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
f->data.correct++;
else
f->data.incorrect++;
+
+ user_access_restore(flags);
}
EXPORT_SYMBOL(ftrace_likely_update);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index dd1f43588d70..fa100ed3b4de 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type)
static int create_dyn_event(int argc, char **argv)
{
struct dyn_event_operations *ops;
- int ret;
+ int ret = -ENODEV;
if (argv[0][0] == '-' || argv[0][0] == '!')
return dyn_event_release(argc, argv, NULL);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 06bb2fd9a56c..fc8e97328e54 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -65,7 +65,8 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
__field( unsigned long, parent_ip )
),
- F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
+ F_printk(" %ps <-- %ps",
+ (void *)__entry->ip, (void *)__entry->parent_ip),
FILTER_TRACE_FN,
@@ -83,7 +84,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
__field_desc( int, graph_ent, depth )
),
- F_printk("--> %lx (%d)", __entry->func, __entry->depth),
+ F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth),
FILTER_OTHER
);
@@ -102,8 +103,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__field_desc( int, ret, depth )
),
- F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
- __entry->func, __entry->depth,
+ F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
+ (void *)__entry->func, __entry->depth,
__entry->calltime, __entry->rettime,
__entry->depth),
@@ -167,12 +168,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
#define FTRACE_STACK_ENTRIES 8
-#ifndef CONFIG_64BIT
-# define IP_FMT "%08lx"
-#else
-# define IP_FMT "%016lx"
-#endif
-
FTRACE_ENTRY(kernel_stack, stack_entry,
TRACE_STACK,
@@ -182,12 +177,13 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
__dynamic_array(unsigned long, caller )
),
- F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
- "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
- "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
- __entry->caller[0], __entry->caller[1], __entry->caller[2],
- __entry->caller[3], __entry->caller[4], __entry->caller[5],
- __entry->caller[6], __entry->caller[7]),
+ F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n"
+ "\t=> %ps\n\t=> %ps\n\t=> %ps\n"
+ "\t=> %ps\n\t=> %ps\n",
+ (void *)__entry->caller[0], (void *)__entry->caller[1],
+ (void *)__entry->caller[2], (void *)__entry->caller[3],
+ (void *)__entry->caller[4], (void *)__entry->caller[5],
+ (void *)__entry->caller[6], (void *)__entry->caller[7]),
FILTER_OTHER
);
@@ -201,12 +197,13 @@ FTRACE_ENTRY(user_stack, userstack_entry,
__array( unsigned long, caller, FTRACE_STACK_ENTRIES )
),
- F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
- "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
- "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
- __entry->caller[0], __entry->caller[1], __entry->caller[2],
- __entry->caller[3], __entry->caller[4], __entry->caller[5],
- __entry->caller[6], __entry->caller[7]),
+ F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n"
+ "\t=> %ps\n\t=> %ps\n\t=> %ps\n"
+ "\t=> %ps\n\t=> %ps\n",
+ (void *)__entry->caller[0], (void *)__entry->caller[1],
+ (void *)__entry->caller[2], (void *)__entry->caller[3],
+ (void *)__entry->caller[4], (void *)__entry->caller[5],
+ (void *)__entry->caller[6], (void *)__entry->caller[7]),
FILTER_OTHER
);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 76217bbef815..4629a6104474 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -299,15 +299,13 @@ int perf_uprobe_init(struct perf_event *p_event,
if (!p_event->attr.uprobe_path)
return -EINVAL;
- path = kzalloc(PATH_MAX, GFP_KERNEL);
- if (!path)
- return -ENOMEM;
- ret = strncpy_from_user(
- path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX);
- if (ret == PATH_MAX)
- return -E2BIG;
- if (ret < 0)
- goto out;
+
+ path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path),
+ PATH_MAX);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
+ return (ret == -EINVAL) ? -E2BIG : ret;
+ }
if (path[0] == '\0') {
ret = -EINVAL;
goto out;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5b3b0c3c8a47..0ce3db67f556 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
return ret;
}
+EXPORT_SYMBOL_GPL(ftrace_set_clr_event);
/**
* trace_set_clr_event - enable or disable an event
@@ -1318,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
char buf[32];
int len;
- if (*ppos)
- return 0;
-
if (unlikely(!id))
return -ENODEV;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 27821480105e..5079d1db3754 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -66,7 +66,8 @@ static const char * ops[] = { OPS };
C(INVALID_FILTER, "Meaningless filter expression"), \
C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \
- C(NO_FILTER, "No filter found"),
+ C(ERRNO, "Error"), \
+ C(NO_FILTER, "No filter found")
#undef C
#define C(a, b) FILT_ERR_##a
@@ -76,7 +77,7 @@ enum { ERRORS };
#undef C
#define C(a, b) b
-static char *err_text[] = { ERRORS };
+static const char *err_text[] = { ERRORS };
/* Called after a '!' character but "!=" and "!~" are not "not"s */
static bool is_not(const char *str)
@@ -427,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL);
if (!op_stack)
return ERR_PTR(-ENOMEM);
- prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL);
+ prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL);
if (!prog_stack) {
parse_error(pe, -ENOMEM, 0);
goto out_free;
@@ -491,10 +492,12 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
break;
case '&':
case '|':
+ /* accepting only "&&" or "||" */
if (next[1] == next[0]) {
ptr++;
break;
}
+ /* fall through */
default:
parse_error(pe, FILT_ERR_TOO_MANY_PREDS,
next - str);
@@ -576,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
out_free:
kfree(op_stack);
kfree(inverts);
- kfree(prog_stack);
+ if (prog_stack) {
+ for (i = 0; prog_stack[i].pred; i++)
+ kfree(prog_stack[i].pred);
+ kfree(prog_stack);
+ }
return ERR_PTR(ret);
}
@@ -823,6 +830,9 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
*search = buff;
+ if (isdigit(buff[0]))
+ return MATCH_INDEX;
+
for (i = 0; i < len; i++) {
if (buff[i] == '*') {
if (!i) {
@@ -860,6 +870,8 @@ static void filter_build_regex(struct filter_pred *pred)
}
switch (type) {
+ /* MATCH_INDEX should not happen, but if it does, match full */
+ case MATCH_INDEX:
case MATCH_FULL:
r->match = regex_match_full;
break;
@@ -912,7 +924,8 @@ static void remove_filter_string(struct event_filter *filter)
filter->filter_string = NULL;
}
-static void append_filter_err(struct filter_parse_error *pe,
+static void append_filter_err(struct trace_array *tr,
+ struct filter_parse_error *pe,
struct event_filter *filter)
{
struct trace_seq *s;
@@ -940,8 +953,14 @@ static void append_filter_err(struct filter_parse_error *pe,
if (pe->lasterr > 0) {
trace_seq_printf(s, "\n%*s", pos, "^");
trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]);
+ tracing_log_err(tr, "event filter parse error",
+ filter->filter_string, err_text,
+ pe->lasterr, pe->lasterr_pos);
} else {
trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr);
+ tracing_log_err(tr, "event filter parse error",
+ filter->filter_string, err_text,
+ FILT_ERR_ERRNO, 0);
}
trace_seq_putc(s, 0);
buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL);
@@ -1207,30 +1226,30 @@ static int parse_pred(const char *str, void *data,
* (perf doesn't use it) and grab everything.
*/
if (strcmp(field->name, "ip") != 0) {
- parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
- goto err_free;
- }
- pred->fn = filter_pred_none;
-
- /*
- * Quotes are not required, but if they exist then we need
- * to read them till we hit a matching one.
- */
- if (str[i] == '\'' || str[i] == '"')
- q = str[i];
- else
- q = 0;
-
- for (i++; str[i]; i++) {
- if (q && str[i] == q)
- break;
- if (!q && (str[i] == ')' || str[i] == '&' ||
- str[i] == '|'))
- break;
- }
- /* Skip quotes */
- if (q)
- s++;
+ parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
+ goto err_free;
+ }
+ pred->fn = filter_pred_none;
+
+ /*
+ * Quotes are not required, but if they exist then we need
+ * to read them till we hit a matching one.
+ */
+ if (str[i] == '\'' || str[i] == '"')
+ q = str[i];
+ else
+ q = 0;
+
+ for (i++; str[i]; i++) {
+ if (q && str[i] == q)
+ break;
+ if (!q && (str[i] == ')' || str[i] == '&' ||
+ str[i] == '|'))
+ break;
+ }
+ /* Skip quotes */
+ if (q)
+ s++;
len = i - s;
if (len >= MAX_FILTER_STR_VAL) {
parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
@@ -1301,7 +1320,7 @@ static int parse_pred(const char *str, void *data,
/* go past the last quote */
i++;
- } else if (isdigit(str[i])) {
+ } else if (isdigit(str[i]) || str[i] == '-') {
/* Make sure the field is not a string */
if (is_string_field(field)) {
@@ -1314,6 +1333,9 @@ static int parse_pred(const char *str, void *data,
goto err_free;
}
+ if (str[i] == '-')
+ i++;
+
/* We allow 0xDEADBEEF */
while (isalnum(str[i]))
i++;
@@ -1590,7 +1612,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
if (err) {
filter_disable(file);
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
- append_filter_err(pe, filter);
+ append_filter_err(tr, pe, filter);
} else
event_set_filtered_flag(file);
@@ -1702,7 +1724,8 @@ static void create_filter_finish(struct filter_parse_error *pe)
* information if @set_str is %true and the caller is responsible for
* freeing it.
*/
-static int create_filter(struct trace_event_call *call,
+static int create_filter(struct trace_array *tr,
+ struct trace_event_call *call,
char *filter_string, bool set_str,
struct event_filter **filterp)
{
@@ -1719,17 +1742,18 @@ static int create_filter(struct trace_event_call *call,
err = process_preds(call, filter_string, *filterp, pe);
if (err && set_str)
- append_filter_err(pe, *filterp);
+ append_filter_err(tr, pe, *filterp);
create_filter_finish(pe);
return err;
}
-int create_event_filter(struct trace_event_call *call,
+int create_event_filter(struct trace_array *tr,
+ struct trace_event_call *call,
char *filter_str, bool set_str,
struct event_filter **filterp)
{
- return create_filter(call, filter_str, set_str, filterp);
+ return create_filter(tr, call, filter_str, set_str, filterp);
}
/**
@@ -1756,7 +1780,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
kfree((*filterp)->filter_string);
(*filterp)->filter_string = NULL;
} else {
- append_filter_err(pe, *filterp);
+ append_filter_err(tr, pe, *filterp);
}
}
create_filter_finish(pe);
@@ -1787,7 +1811,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string)
return 0;
}
- err = create_filter(call, filter_string, true, &filter);
+ err = create_filter(file->tr, call, filter_string, true, &filter);
/*
* Always swap the call filter with the new filter
@@ -2043,7 +2067,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
if (event->filter)
goto out_unlock;
- err = create_filter(call, filter_str, false, &filter);
+ err = create_filter(NULL, call, filter_str, false, &filter);
if (err)
goto free_filter;
@@ -2192,8 +2216,8 @@ static __init int ftrace_test_event_filter(void)
struct test_filter_data_t *d = &test_filter_data[i];
int err;
- err = create_filter(&event_ftrace_test_filter, d->filter,
- false, &filter);
+ err = create_filter(NULL, &event_ftrace_test_filter,
+ d->filter, false, &filter);
if (err) {
printk(KERN_INFO
"Failed to get filter for '%s', err %d\n",
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 449d90cfa151..ca6b0dff60c5 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -22,6 +22,57 @@
#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
+#define ERRORS \
+ C(NONE, "No error"), \
+ C(DUPLICATE_VAR, "Variable already defined"), \
+ C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \
+ C(TOO_MANY_VARS, "Too many variables defined"), \
+ C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \
+ C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \
+ C(TRIGGER_EEXIST, "Hist trigger already exists"), \
+ C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \
+ C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \
+ C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \
+ C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \
+ C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \
+ C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \
+ C(EVENT_FILE_NOT_FOUND, "Event file not found"), \
+ C(HIST_NOT_FOUND, "Matching event histogram not found"), \
+ C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \
+ C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \
+ C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \
+ C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \
+ C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \
+ C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \
+ C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \
+ C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \
+ C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \
+ C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \
+ C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \
+ C(TOO_MANY_PARAMS, "Too many action params"), \
+ C(PARAM_NOT_FOUND, "Couldn't find param"), \
+ C(INVALID_PARAM, "Invalid action param"), \
+ C(ACTION_NOT_FOUND, "No action found"), \
+ C(NO_SAVE_PARAMS, "No params found for save()"), \
+ C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \
+ C(ACTION_MISMATCH, "Handler doesn't support action"), \
+ C(NO_CLOSING_PAREN, "No closing paren found"), \
+ C(SUBSYS_NOT_FOUND, "Missing subsystem"), \
+ C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \
+ C(INVALID_REF_KEY, "Using variable references in keys not supported"), \
+ C(VAR_NOT_FOUND, "Couldn't find variable"), \
+ C(FIELD_NOT_FOUND, "Couldn't find field"),
+
+#undef C
+#define C(a, b) HIST_ERR_##a
+
+enum { ERRORS };
+
+#undef C
+#define C(a, b) b
+
+static const char *err_text[] = { ERRORS };
+
struct hist_field;
typedef u64 (*hist_field_fn_t) (struct hist_field *field,
@@ -313,9 +364,9 @@ struct hist_trigger_data {
struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
unsigned int n_field_var_hists;
- struct field_var *max_vars[SYNTH_FIELDS_MAX];
- unsigned int n_max_vars;
- unsigned int n_max_var_str;
+ struct field_var *save_vars[SYNTH_FIELDS_MAX];
+ unsigned int n_save_vars;
+ unsigned int n_save_var_str;
};
static int synth_event_create(int argc, const char **argv);
@@ -383,98 +434,201 @@ struct action_data;
typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt, void *rec,
- struct ring_buffer_event *rbe,
+ struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals);
+typedef bool (*check_track_val_fn_t) (u64 track_val, u64 var_val);
+
+enum handler_id {
+ HANDLER_ONMATCH = 1,
+ HANDLER_ONMAX,
+ HANDLER_ONCHANGE,
+};
+
+enum action_id {
+ ACTION_SAVE = 1,
+ ACTION_TRACE,
+ ACTION_SNAPSHOT,
+};
+
struct action_data {
+ enum handler_id handler;
+ enum action_id action;
+ char *action_name;
action_fn_t fn;
+
unsigned int n_params;
char *params[SYNTH_FIELDS_MAX];
+ /*
+ * When a histogram trigger is hit, the values of any
+ * references to variables, including variables being passed
+ * as parameters to synthetic events, are collected into a
+ * var_ref_vals array. This var_ref_idx is the index of the
+ * first param in the array to be passed to the synthetic
+ * event invocation.
+ */
+ unsigned int var_ref_idx;
+ struct synth_event *synth_event;
+ bool use_trace_keyword;
+ char *synth_event_name;
+
union {
struct {
- /*
- * When a histogram trigger is hit, the values of any
- * references to variables, including variables being passed
- * as parameters to synthetic events, are collected into a
- * var_ref_vals array. This var_ref_idx is the index of the
- * first param in the array to be passed to the synthetic
- * event invocation.
- */
- unsigned int var_ref_idx;
- char *match_event;
- char *match_event_system;
- char *synth_event_name;
- struct synth_event *synth_event;
- } onmatch;
+ char *event;
+ char *event_system;
+ } match_data;
struct {
+ /*
+ * var_str contains the $-unstripped variable
+ * name referenced by var_ref, and used when
+ * printing the action. Because var_ref
+ * creation is deferred to create_actions(),
+ * we need a per-action way to save it until
+ * then, thus var_str.
+ */
char *var_str;
- char *fn_name;
- unsigned int max_var_ref_idx;
- struct hist_field *max_var;
- struct hist_field *var;
- } onmax;
+
+ /*
+ * var_ref refers to the variable being
+ * tracked e.g onmax($var).
+ */
+ struct hist_field *var_ref;
+
+ /*
+ * track_var contains the 'invisible' tracking
+ * variable created to keep the current
+ * e.g. max value.
+ */
+ struct hist_field *track_var;
+
+ check_track_val_fn_t check_val;
+ action_fn_t save_data;
+ } track_data;
};
};
+struct track_data {
+ u64 track_val;
+ bool updated;
+
+ unsigned int key_len;
+ void *key;
+ struct tracing_map_elt elt;
-static char last_hist_cmd[MAX_FILTER_STR_VAL];
-static char hist_err_str[MAX_FILTER_STR_VAL];
+ struct action_data *action_data;
+ struct hist_trigger_data *hist_data;
+};
-static void last_cmd_set(char *str)
+struct hist_elt_data {
+ char *comm;
+ u64 *var_ref_vals;
+ char *field_var_str[SYNTH_FIELDS_MAX];
+};
+
+struct snapshot_context {
+ struct tracing_map_elt *elt;
+ void *key;
+};
+
+static void track_data_free(struct track_data *track_data)
{
- if (!str)
+ struct hist_elt_data *elt_data;
+
+ if (!track_data)
return;
- strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
+ kfree(track_data->key);
+
+ elt_data = track_data->elt.private_data;
+ if (elt_data) {
+ kfree(elt_data->comm);
+ kfree(elt_data);
+ }
+
+ kfree(track_data);
}
-static void hist_err(char *str, char *var)
+static struct track_data *track_data_alloc(unsigned int key_len,
+ struct action_data *action_data,
+ struct hist_trigger_data *hist_data)
{
- int maxlen = MAX_FILTER_STR_VAL - 1;
+ struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
+ struct hist_elt_data *elt_data;
- if (!str)
- return;
+ if (!data)
+ return ERR_PTR(-ENOMEM);
- if (strlen(hist_err_str))
- return;
+ data->key = kzalloc(key_len, GFP_KERNEL);
+ if (!data->key) {
+ track_data_free(data);
+ return ERR_PTR(-ENOMEM);
+ }
- if (!var)
- var = "";
+ data->key_len = key_len;
+ data->action_data = action_data;
+ data->hist_data = hist_data;
- if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
- return;
+ elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
+ if (!elt_data) {
+ track_data_free(data);
+ return ERR_PTR(-ENOMEM);
+ }
+ data->elt.private_data = elt_data;
+
+ elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL);
+ if (!elt_data->comm) {
+ track_data_free(data);
+ return ERR_PTR(-ENOMEM);
+ }
- strcat(hist_err_str, str);
- strcat(hist_err_str, var);
+ return data;
}
-static void hist_err_event(char *str, char *system, char *event, char *var)
+static char last_cmd[MAX_FILTER_STR_VAL];
+static char last_cmd_loc[MAX_FILTER_STR_VAL];
+
+static int errpos(char *str)
{
- char err[MAX_FILTER_STR_VAL];
+ return err_pos(last_cmd, str);
+}
- if (system && var)
- snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
- else if (system)
- snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
- else
- strscpy(err, var, MAX_FILTER_STR_VAL);
+static void last_cmd_set(struct trace_event_file *file, char *str)
+{
+ const char *system = NULL, *name = NULL;
+ struct trace_event_call *call;
- hist_err(str, err);
+ if (!str)
+ return;
+
+ strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+
+ if (file) {
+ call = file->event_call;
+
+ system = call->class->system;
+ if (system) {
+ name = trace_event_name(call);
+ if (!name)
+ system = NULL;
+ }
+ }
+
+ if (system)
+ snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
}
-static void hist_err_clear(void)
+static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
{
- hist_err_str[0] = '\0';
+ tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
+ err_type, err_pos);
}
-static bool have_hist_err(void)
+static void hist_err_clear(void)
{
- if (strlen(hist_err_str))
- return true;
-
- return false;
+ last_cmd[0] = '\0';
+ last_cmd_loc[0] = '\0';
}
struct synth_trace_event {
@@ -1078,12 +1232,12 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields,
static void action_trace(struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt, void *rec,
- struct ring_buffer_event *rbe,
+ struct ring_buffer_event *rbe, void *key,
struct action_data *data, u64 *var_ref_vals)
{
- struct synth_event *event = data->onmatch.synth_event;
+ struct synth_event *event = data->synth_event;
- trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
+ trace_synth(event, var_ref_vals, data->var_ref_idx);
}
struct hist_var_data {
@@ -1200,8 +1354,8 @@ static int synth_event_create(int argc, const char **argv)
/* This interface accepts group name prefix */
if (strchr(name, '/')) {
- len = sizeof(SYNTH_SYSTEM "/") - 1;
- if (strncmp(name, SYNTH_SYSTEM "/", len))
+ len = str_has_prefix(name, SYNTH_SYSTEM "/");
+ if (len == 0)
return -EINVAL;
name += len;
}
@@ -1603,7 +1757,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr,
if (find_var_field(var_hist_data, var_name)) {
if (found) {
- hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+ hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name));
return NULL;
}
@@ -1644,9 +1798,9 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name)
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
- if (data->fn == action_trace) {
- char *system = data->onmatch.match_event_system;
- char *event_name = data->onmatch.match_event;
+ if (data->handler == HANDLER_ONMATCH) {
+ char *system = data->match_data.event_system;
+ char *event_name = data->match_data.event;
file = find_var_file(tr, system, event_name, var_name);
if (!file)
@@ -1654,7 +1808,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name)
hist_field = find_file_var(file, var_name);
if (hist_field) {
if (found) {
- hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+ hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE,
+ errpos(var_name));
return ERR_PTR(-EINVAL);
}
@@ -1691,12 +1846,6 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
return hist_field;
}
-struct hist_elt_data {
- char *comm;
- u64 *var_ref_vals;
- char *field_var_str[SYNTH_FIELDS_MAX];
-};
-
static u64 hist_field_var_ref(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct ring_buffer_event *rbe,
@@ -1705,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field,
struct hist_elt_data *elt_data;
u64 var_val = 0;
+ if (WARN_ON_ONCE(!elt))
+ return var_val;
+
elt_data = elt->private_data;
var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
@@ -1882,7 +2034,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
return ret;
if ((str_has_prefix(str, "onmatch(")) ||
- (str_has_prefix(str, "onmax("))) {
+ (str_has_prefix(str, "onmax(")) ||
+ (str_has_prefix(str, "onchange("))) {
attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
if (!attrs->action_str[attrs->n_actions]) {
ret = -ENOMEM;
@@ -1891,11 +2044,11 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs)
attrs->n_actions++;
ret = 0;
}
-
return ret;
}
-static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
+static int parse_assignment(struct trace_array *tr,
+ char *str, struct hist_trigger_attrs *attrs)
{
int ret = 0;
@@ -1951,7 +2104,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
char *assignment;
if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
- hist_err("Too many variables defined: ", str);
+ hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str));
ret = -EINVAL;
goto out;
}
@@ -1968,7 +2121,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
return ret;
}
-static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
+static struct hist_trigger_attrs *
+parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str)
{
struct hist_trigger_attrs *attrs;
int ret = 0;
@@ -1981,7 +2135,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
char *str = strsep(&trigger_str, ":");
if (strchr(str, '=')) {
- ret = parse_assignment(str, attrs);
+ ret = parse_assignment(tr, str, attrs);
if (ret)
goto free;
} else if (strcmp(str, "pause") == 0)
@@ -2030,7 +2184,7 @@ static inline void save_comm(char *comm, struct task_struct *task)
return;
}
- memcpy(comm, task->comm, TASK_COMM_LEN);
+ strncpy(comm, task->comm, TASK_COMM_LEN);
}
static void hist_elt_data_free(struct hist_elt_data *elt_data)
@@ -2076,7 +2230,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
}
}
- n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
+ n_str = hist_data->n_field_var_str + hist_data->n_save_var_str;
size = STR_VAR_LEN_MAX;
@@ -2537,6 +2691,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
char *var_name)
{
struct hist_field *var_field = NULL, *ref_field = NULL;
+ struct trace_array *tr = hist_data->event_file->tr;
if (!is_var_ref(var_name))
return NULL;
@@ -2549,8 +2704,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
system, event_name);
if (!ref_field)
- hist_err_event("Couldn't find variable: $",
- system, event_name, var_name);
+ hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name));
return ref_field;
}
@@ -2561,6 +2715,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
{
struct ftrace_event_field *field = NULL;
char *field_name, *modifier, *str;
+ struct trace_array *tr = file->tr;
modifier = str = kstrdup(field_str, GFP_KERNEL);
if (!modifier)
@@ -2584,7 +2739,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
else if (strcmp(modifier, "usecs") == 0)
*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
else {
- hist_err("Invalid field modifier: ", modifier);
+ hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier));
field = ERR_PTR(-EINVAL);
goto out;
}
@@ -2600,7 +2755,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
else {
field = trace_find_event_field(file->event_call, field_name);
if (!field || !field->size) {
- hist_err("Couldn't find field: ", field_name);
+ hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
field = ERR_PTR(-EINVAL);
goto out;
}
@@ -2662,7 +2817,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
if (!s) {
- hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
+ hist_field = parse_var_ref(hist_data, ref_system,
+ ref_event, ref_var);
if (hist_field) {
if (var_name) {
hist_field = create_alias(hist_data, hist_field, var_name);
@@ -2711,7 +2867,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
/* we support only -(xxx) i.e. explicit parens required */
if (level > 3) {
- hist_err("Too many subexpressions (3 max): ", str);
+ hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
ret = -EINVAL;
goto free;
}
@@ -2766,7 +2922,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
return ERR_PTR(ret);
}
-static int check_expr_operands(struct hist_field *operand1,
+static int check_expr_operands(struct trace_array *tr,
+ struct hist_field *operand1,
struct hist_field *operand2)
{
unsigned long operand1_flags = operand1->flags;
@@ -2794,7 +2951,7 @@ static int check_expr_operands(struct hist_field *operand1,
if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
(operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
- hist_err("Timestamp units in expression don't match", NULL);
+ hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0);
return -EINVAL;
}
@@ -2812,7 +2969,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
char *sep, *operand1_str;
if (level > 3) {
- hist_err("Too many subexpressions (3 max): ", str);
+ hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str));
return ERR_PTR(-EINVAL);
}
@@ -2857,7 +3014,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
goto free;
}
- ret = check_expr_operands(operand1, operand2);
+ ret = check_expr_operands(file->tr, operand1, operand2);
if (ret)
goto free;
@@ -3050,16 +3207,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
int ret;
if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
- hist_err_event("onmatch: Too many field variables defined: ",
- subsys_name, event_name, field_name);
+ hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name));
return ERR_PTR(-EINVAL);
}
file = event_file(tr, subsys_name, event_name);
if (IS_ERR(file)) {
- hist_err_event("onmatch: Event file not found: ",
- subsys_name, event_name, field_name);
+ hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name));
ret = PTR_ERR(file);
return ERR_PTR(ret);
}
@@ -3072,8 +3227,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
*/
hist_data = find_compatible_hist(target_hist_data, file);
if (!hist_data) {
- hist_err_event("onmatch: Matching event histogram not found: ",
- subsys_name, event_name, field_name);
+ hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name));
return ERR_PTR(-EINVAL);
}
@@ -3134,8 +3288,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
kfree(cmd);
kfree(var_hist->cmd);
kfree(var_hist);
- hist_err_event("onmatch: Couldn't create histogram for field: ",
- subsys_name, event_name, field_name);
+ hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name));
return ERR_PTR(ret);
}
@@ -3147,8 +3300,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
if (IS_ERR_OR_NULL(event_var)) {
kfree(var_hist->cmd);
kfree(var_hist);
- hist_err_event("onmatch: Couldn't find synthetic variable: ",
- subsys_name, event_name, field_name);
+ hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name));
return ERR_PTR(-EINVAL);
}
@@ -3225,13 +3377,13 @@ static void update_field_vars(struct hist_trigger_data *hist_data,
hist_data->n_field_vars, 0);
}
-static void update_max_vars(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt,
- struct ring_buffer_event *rbe,
- void *rec)
+static void save_track_data_vars(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data, u64 *var_ref_vals)
{
- __update_field_vars(elt, rbe, rec, hist_data->max_vars,
- hist_data->n_max_vars, hist_data->n_field_var_str);
+ __update_field_vars(elt, rbe, rec, hist_data->save_vars,
+ hist_data->n_save_vars, hist_data->n_field_var_str);
}
static struct hist_field *create_var(struct hist_trigger_data *hist_data,
@@ -3281,25 +3433,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
{
struct hist_field *val = NULL, *var = NULL;
unsigned long flags = HIST_FIELD_FL_VAR;
+ struct trace_array *tr = file->tr;
struct field_var *field_var;
int ret = 0;
if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
- hist_err("Too many field variables defined: ", field_name);
+ hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name));
ret = -EINVAL;
goto err;
}
val = parse_atom(hist_data, file, field_name, &flags, NULL);
if (IS_ERR(val)) {
- hist_err("Couldn't parse field variable: ", field_name);
+ hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name));
ret = PTR_ERR(val);
goto err;
}
var = create_var(hist_data, file, field_name, val->size, val->type);
if (IS_ERR(var)) {
- hist_err("Couldn't create or find variable: ", field_name);
+ hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name));
kfree(val);
ret = PTR_ERR(var);
goto err;
@@ -3366,18 +3519,196 @@ create_target_field_var(struct hist_trigger_data *target_hist_data,
return create_field_var(target_hist_data, file, var_name);
}
-static void onmax_print(struct seq_file *m,
- struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt,
- struct action_data *data)
+static bool check_track_val_max(u64 track_val, u64 var_val)
+{
+ if (var_val <= track_val)
+ return false;
+
+ return true;
+}
+
+static bool check_track_val_changed(u64 track_val, u64 var_val)
+{
+ if (var_val == track_val)
+ return false;
+
+ return true;
+}
+
+static u64 get_track_val(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct action_data *data)
{
- unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;
+ unsigned int track_var_idx = data->track_data.track_var->var.idx;
+ u64 track_val;
+
+ track_val = tracing_map_read_var(elt, track_var_idx);
- seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));
+ return track_val;
+}
- for (i = 0; i < hist_data->n_max_vars; i++) {
- struct hist_field *save_val = hist_data->max_vars[i]->val;
- struct hist_field *save_var = hist_data->max_vars[i]->var;
+static void save_track_val(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct action_data *data, u64 var_val)
+{
+ unsigned int track_var_idx = data->track_data.track_var->var.idx;
+
+ tracing_map_set_var(elt, track_var_idx, var_val);
+}
+
+static void save_track_data(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data, u64 *var_ref_vals)
+{
+ if (data->track_data.save_data)
+ data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+}
+
+static bool check_track_val(struct tracing_map_elt *elt,
+ struct action_data *data,
+ u64 var_val)
+{
+ struct hist_trigger_data *hist_data;
+ u64 track_val;
+
+ hist_data = data->track_data.track_var->hist_data;
+ track_val = get_track_val(hist_data, elt, data);
+
+ return data->track_data.check_val(track_val, var_val);
+}
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
+{
+ /* called with tr->max_lock held */
+ struct track_data *track_data = tr->cond_snapshot->cond_data;
+ struct hist_elt_data *elt_data, *track_elt_data;
+ struct snapshot_context *context = cond_data;
+ struct action_data *action;
+ u64 track_val;
+
+ if (!track_data)
+ return false;
+
+ action = track_data->action_data;
+
+ track_val = get_track_val(track_data->hist_data, context->elt,
+ track_data->action_data);
+
+ if (!action->track_data.check_val(track_data->track_val, track_val))
+ return false;
+
+ track_data->track_val = track_val;
+ memcpy(track_data->key, context->key, track_data->key_len);
+
+ elt_data = context->elt->private_data;
+ track_elt_data = track_data->elt.private_data;
+ if (elt_data->comm)
+ strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN);
+
+ track_data->updated = true;
+
+ return true;
+}
+
+static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data,
+ u64 *var_ref_vals)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct snapshot_context context;
+
+ context.elt = elt;
+ context.key = key;
+
+ tracing_snapshot_cond(file->tr, &context);
+}
+
+static void hist_trigger_print_key(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ void *key,
+ struct tracing_map_elt *elt);
+
+static struct action_data *snapshot_action(struct hist_trigger_data *hist_data)
+{
+ unsigned int i;
+
+ if (!hist_data->n_actions)
+ return NULL;
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *data = hist_data->actions[i];
+
+ if (data->action == ACTION_SNAPSHOT)
+ return data;
+ }
+
+ return NULL;
+}
+
+static void track_data_snapshot_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct track_data *track_data;
+ struct action_data *action;
+
+ track_data = tracing_cond_snapshot_data(file->tr);
+ if (!track_data)
+ return;
+
+ if (!track_data->updated)
+ return;
+
+ action = snapshot_action(hist_data);
+ if (!action)
+ return;
+
+ seq_puts(m, "\nSnapshot taken (see tracing/snapshot). Details:\n");
+ seq_printf(m, "\ttriggering value { %s(%s) }: %10llu",
+ action->handler == HANDLER_ONMAX ? "onmax" : "onchange",
+ action->track_data.var_str, track_data->track_val);
+
+ seq_puts(m, "\ttriggered by event with key: ");
+ hist_trigger_print_key(m, hist_data, track_data->key, &track_data->elt);
+ seq_putc(m, '\n');
+}
+#else
+static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
+{
+ return false;
+}
+static void save_track_data_snapshot(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data,
+ u64 *var_ref_vals) {}
+static void track_data_snapshot_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data) {}
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+static void track_data_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt,
+ struct action_data *data)
+{
+ u64 track_val = get_track_val(hist_data, elt, data);
+ unsigned int i, save_var_idx;
+
+ if (data->handler == HANDLER_ONMAX)
+ seq_printf(m, "\n\tmax: %10llu", track_val);
+ else if (data->handler == HANDLER_ONCHANGE)
+ seq_printf(m, "\n\tchanged: %10llu", track_val);
+
+ if (data->action == ACTION_SNAPSHOT)
+ return;
+
+ for (i = 0; i < hist_data->n_save_vars; i++) {
+ struct hist_field *save_val = hist_data->save_vars[i]->val;
+ struct hist_field *save_var = hist_data->save_vars[i]->var;
u64 val;
save_var_idx = save_var->var.idx;
@@ -3392,64 +3723,82 @@ static void onmax_print(struct seq_file *m,
}
}
-static void onmax_save(struct hist_trigger_data *hist_data,
- struct tracing_map_elt *elt, void *rec,
- struct ring_buffer_event *rbe,
- struct action_data *data, u64 *var_ref_vals)
+static void ontrack_action(struct hist_trigger_data *hist_data,
+ struct tracing_map_elt *elt, void *rec,
+ struct ring_buffer_event *rbe, void *key,
+ struct action_data *data, u64 *var_ref_vals)
{
- unsigned int max_idx = data->onmax.max_var->var.idx;
- unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
-
- u64 var_val, max_val;
+ u64 var_val = var_ref_vals[data->track_data.var_ref->var_ref_idx];
- var_val = var_ref_vals[max_var_ref_idx];
- max_val = tracing_map_read_var(elt, max_idx);
-
- if (var_val <= max_val)
- return;
-
- tracing_map_set_var(elt, max_idx, var_val);
-
- update_max_vars(hist_data, elt, rbe, rec);
+ if (check_track_val(elt, data, var_val)) {
+ save_track_val(hist_data, elt, data, var_val);
+ save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals);
+ }
}
-static void onmax_destroy(struct action_data *data)
+static void action_data_destroy(struct action_data *data)
{
unsigned int i;
- destroy_hist_field(data->onmax.max_var, 0);
- destroy_hist_field(data->onmax.var, 0);
+ lockdep_assert_held(&event_mutex);
- kfree(data->onmax.var_str);
- kfree(data->onmax.fn_name);
+ kfree(data->action_name);
for (i = 0; i < data->n_params; i++)
kfree(data->params[i]);
+ if (data->synth_event)
+ data->synth_event->ref--;
+
+ kfree(data->synth_event_name);
+
kfree(data);
}
-static int onmax_create(struct hist_trigger_data *hist_data,
- struct action_data *data)
+static void track_data_destroy(struct hist_trigger_data *hist_data,
+ struct action_data *data)
{
struct trace_event_file *file = hist_data->event_file;
- struct hist_field *var_field, *ref_field, *max_var;
- unsigned int var_ref_idx = hist_data->n_var_refs;
- struct field_var *field_var;
- char *onmax_var_str, *param;
- unsigned int i;
+
+ destroy_hist_field(data->track_data.track_var, 0);
+
+ if (data->action == ACTION_SNAPSHOT) {
+ struct track_data *track_data;
+
+ track_data = tracing_cond_snapshot_data(file->tr);
+ if (track_data && track_data->hist_data == hist_data) {
+ tracing_snapshot_cond_disable(file->tr);
+ track_data_free(track_data);
+ }
+ }
+
+ kfree(data->track_data.var_str);
+
+ action_data_destroy(data);
+}
+
+static int action_create(struct hist_trigger_data *hist_data,
+ struct action_data *data);
+
+static int track_data_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ struct hist_field *var_field, *ref_field, *track_var = NULL;
+ struct trace_event_file *file = hist_data->event_file;
+ struct trace_array *tr = file->tr;
+ char *track_data_var_str;
int ret = 0;
- onmax_var_str = data->onmax.var_str;
- if (onmax_var_str[0] != '$') {
- hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
+ track_data_var_str = data->track_data.var_str;
+ if (track_data_var_str[0] != '$') {
+ hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str));
return -EINVAL;
}
- onmax_var_str++;
+ track_data_var_str++;
- var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
+ var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str);
if (!var_field) {
- hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
+ hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str));
return -EINVAL;
}
@@ -3457,61 +3806,53 @@ static int onmax_create(struct hist_trigger_data *hist_data,
if (!ref_field)
return -ENOMEM;
- data->onmax.var = ref_field;
+ data->track_data.var_ref = ref_field;
- data->fn = onmax_save;
- data->onmax.max_var_ref_idx = var_ref_idx;
- max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
- if (IS_ERR(max_var)) {
- hist_err("onmax: Couldn't create onmax variable: ", "max");
- ret = PTR_ERR(max_var);
+ if (data->handler == HANDLER_ONMAX)
+ track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64");
+ if (IS_ERR(track_var)) {
+ hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0);
+ ret = PTR_ERR(track_var);
goto out;
}
- data->onmax.max_var = max_var;
- for (i = 0; i < data->n_params; i++) {
- param = kstrdup(data->params[i], GFP_KERNEL);
- if (!param) {
- ret = -ENOMEM;
- goto out;
- }
-
- field_var = create_target_field_var(hist_data, NULL, NULL, param);
- if (IS_ERR(field_var)) {
- hist_err("onmax: Couldn't create field variable: ", param);
- ret = PTR_ERR(field_var);
- kfree(param);
- goto out;
- }
-
- hist_data->max_vars[hist_data->n_max_vars++] = field_var;
- if (field_var->val->flags & HIST_FIELD_FL_STRING)
- hist_data->n_max_var_str++;
-
- kfree(param);
+ if (data->handler == HANDLER_ONCHANGE)
+ track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64");
+ if (IS_ERR(track_var)) {
+ hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0);
+ ret = PTR_ERR(track_var);
+ goto out;
}
+ data->track_data.track_var = track_var;
+
+ ret = action_create(hist_data, data);
out:
return ret;
}
-static int parse_action_params(char *params, struct action_data *data)
+static int parse_action_params(struct trace_array *tr, char *params,
+ struct action_data *data)
{
char *param, *saved_param;
+ bool first_param = true;
int ret = 0;
while (params) {
- if (data->n_params >= SYNTH_FIELDS_MAX)
+ if (data->n_params >= SYNTH_FIELDS_MAX) {
+ hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0);
goto out;
+ }
param = strsep(&params, ",");
if (!param) {
+ hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0);
ret = -EINVAL;
goto out;
}
param = strstrip(param);
if (strlen(param) < 2) {
- hist_err("Invalid action param: ", param);
+ hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param));
ret = -EINVAL;
goto out;
}
@@ -3522,86 +3863,164 @@ static int parse_action_params(char *params, struct action_data *data)
goto out;
}
+ if (first_param && data->use_trace_keyword) {
+ data->synth_event_name = saved_param;
+ first_param = false;
+ continue;
+ }
+ first_param = false;
+
data->params[data->n_params++] = saved_param;
}
out:
return ret;
}
-static struct action_data *onmax_parse(char *str)
+static int action_parse(struct trace_array *tr, char *str, struct action_data *data,
+ enum handler_id handler)
{
- char *onmax_fn_name, *onmax_var_str;
- struct action_data *data;
- int ret = -EINVAL;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return ERR_PTR(-ENOMEM);
+ char *action_name;
+ int ret = 0;
- onmax_var_str = strsep(&str, ")");
- if (!onmax_var_str || !str) {
+ strsep(&str, ".");
+ if (!str) {
+ hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0);
ret = -EINVAL;
- goto free;
+ goto out;
}
- data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
- if (!data->onmax.var_str) {
- ret = -ENOMEM;
- goto free;
+ action_name = strsep(&str, "(");
+ if (!action_name || !str) {
+ hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0);
+ ret = -EINVAL;
+ goto out;
}
- strsep(&str, ".");
- if (!str)
- goto free;
-
- onmax_fn_name = strsep(&str, "(");
- if (!onmax_fn_name || !str)
- goto free;
-
- if (str_has_prefix(onmax_fn_name, "save")) {
+ if (str_has_prefix(action_name, "save")) {
char *params = strsep(&str, ")");
if (!params) {
+ hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0);
ret = -EINVAL;
- goto free;
+ goto out;
}
- ret = parse_action_params(params, data);
+ ret = parse_action_params(tr, params, data);
if (ret)
- goto free;
- } else
+ goto out;
+
+ if (handler == HANDLER_ONMAX)
+ data->track_data.check_val = check_track_val_max;
+ else if (handler == HANDLER_ONCHANGE)
+ data->track_data.check_val = check_track_val_changed;
+ else {
+ hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name));
+ ret = -EINVAL;
+ goto out;
+ }
+
+ data->track_data.save_data = save_track_data_vars;
+ data->fn = ontrack_action;
+ data->action = ACTION_SAVE;
+ } else if (str_has_prefix(action_name, "snapshot")) {
+ char *params = strsep(&str, ")");
+
+ if (!str) {
+ hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params));
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (handler == HANDLER_ONMAX)
+ data->track_data.check_val = check_track_val_max;
+ else if (handler == HANDLER_ONCHANGE)
+ data->track_data.check_val = check_track_val_changed;
+ else {
+ hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name));
+ ret = -EINVAL;
+ goto out;
+ }
+
+ data->track_data.save_data = save_track_data_snapshot;
+ data->fn = ontrack_action;
+ data->action = ACTION_SNAPSHOT;
+ } else {
+ char *params = strsep(&str, ")");
+
+ if (str_has_prefix(action_name, "trace"))
+ data->use_trace_keyword = true;
+
+ if (params) {
+ ret = parse_action_params(tr, params, data);
+ if (ret)
+ goto out;
+ }
+
+ if (handler == HANDLER_ONMAX)
+ data->track_data.check_val = check_track_val_max;
+ else if (handler == HANDLER_ONCHANGE)
+ data->track_data.check_val = check_track_val_changed;
+
+ if (handler != HANDLER_ONMATCH) {
+ data->track_data.save_data = action_trace;
+ data->fn = ontrack_action;
+ } else
+ data->fn = action_trace;
+
+ data->action = ACTION_TRACE;
+ }
+
+ data->action_name = kstrdup(action_name, GFP_KERNEL);
+ if (!data->action_name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data->handler = handler;
+ out:
+ return ret;
+}
+
+static struct action_data *track_data_parse(struct hist_trigger_data *hist_data,
+ char *str, enum handler_id handler)
+{
+ struct action_data *data;
+ int ret = -EINVAL;
+ char *var_str;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ var_str = strsep(&str, ")");
+ if (!var_str || !str) {
+ ret = -EINVAL;
goto free;
+ }
- data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
- if (!data->onmax.fn_name) {
+ data->track_data.var_str = kstrdup(var_str, GFP_KERNEL);
+ if (!data->track_data.var_str) {
ret = -ENOMEM;
goto free;
}
+
+ ret = action_parse(hist_data->event_file->tr, str, data, handler);
+ if (ret)
+ goto free;
out:
return data;
free:
- onmax_destroy(data);
+ track_data_destroy(hist_data, data);
data = ERR_PTR(ret);
goto out;
}
static void onmatch_destroy(struct action_data *data)
{
- unsigned int i;
-
- lockdep_assert_held(&event_mutex);
+ kfree(data->match_data.event);
+ kfree(data->match_data.event_system);
- kfree(data->onmatch.match_event);
- kfree(data->onmatch.match_event_system);
- kfree(data->onmatch.synth_event_name);
-
- for (i = 0; i < data->n_params; i++)
- kfree(data->params[i]);
-
- if (data->onmatch.synth_event)
- data->onmatch.synth_event->ref--;
-
- kfree(data);
+ action_data_destroy(data);
}
static void destroy_field_var(struct field_var *field_var)
@@ -3651,33 +4070,35 @@ static int check_synth_field(struct synth_event *event,
}
static struct hist_field *
-onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
- char *system, char *event, char *var)
+trace_action_find_var(struct hist_trigger_data *hist_data,
+ struct action_data *data,
+ char *system, char *event, char *var)
{
+ struct trace_array *tr = hist_data->event_file->tr;
struct hist_field *hist_field;
var++; /* skip '$' */
hist_field = find_target_event_var(hist_data, system, event, var);
if (!hist_field) {
- if (!system) {
- system = data->onmatch.match_event_system;
- event = data->onmatch.match_event;
+ if (!system && data->handler == HANDLER_ONMATCH) {
+ system = data->match_data.event_system;
+ event = data->match_data.event;
}
hist_field = find_event_var(hist_data, system, event, var);
}
if (!hist_field)
- hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);
+ hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var));
return hist_field;
}
static struct hist_field *
-onmatch_create_field_var(struct hist_trigger_data *hist_data,
- struct action_data *data, char *system,
- char *event, char *var)
+trace_action_create_field_var(struct hist_trigger_data *hist_data,
+ struct action_data *data, char *system,
+ char *event, char *var)
{
struct hist_field *hist_field = NULL;
struct field_var *field_var;
@@ -3700,9 +4121,9 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data,
* looking for fields on the onmatch(system.event.xxx)
* event.
*/
- if (!system) {
- system = data->onmatch.match_event_system;
- event = data->onmatch.match_event;
+ if (!system && data->handler == HANDLER_ONMATCH) {
+ system = data->match_data.event_system;
+ event = data->match_data.event;
}
/*
@@ -3724,24 +4145,31 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data,
goto out;
}
-static int onmatch_create(struct hist_trigger_data *hist_data,
- struct trace_event_file *file,
- struct action_data *data)
+static int trace_action_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
{
+ struct trace_array *tr = hist_data->event_file->tr;
char *event_name, *param, *system = NULL;
struct hist_field *hist_field, *var_ref;
unsigned int i, var_ref_idx;
unsigned int field_pos = 0;
struct synth_event *event;
+ char *synth_event_name;
int ret = 0;
lockdep_assert_held(&event_mutex);
- event = find_synth_event(data->onmatch.synth_event_name);
+ if (data->use_trace_keyword)
+ synth_event_name = data->synth_event_name;
+ else
+ synth_event_name = data->action_name;
+
+ event = find_synth_event(synth_event_name);
if (!event) {
- hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
+ hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name));
return -EINVAL;
}
+
event->ref++;
var_ref_idx = hist_data->n_var_refs;
@@ -3769,13 +4197,15 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
}
if (param[0] == '$')
- hist_field = onmatch_find_var(hist_data, data, system,
- event_name, param);
+ hist_field = trace_action_find_var(hist_data, data,
+ system, event_name,
+ param);
else
- hist_field = onmatch_create_field_var(hist_data, data,
- system,
- event_name,
- param);
+ hist_field = trace_action_create_field_var(hist_data,
+ data,
+ system,
+ event_name,
+ param);
if (!hist_field) {
kfree(p);
@@ -3797,22 +4227,20 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
continue;
}
- hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
- system, event_name, param);
+ hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param));
kfree(p);
ret = -EINVAL;
goto err;
}
if (field_pos != event->n_fields) {
- hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
+ hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name));
ret = -EINVAL;
goto err;
}
- data->fn = action_trace;
- data->onmatch.synth_event = event;
- data->onmatch.var_ref_idx = var_ref_idx;
+ data->synth_event = event;
+ data->var_ref_idx = var_ref_idx;
out:
return ret;
err:
@@ -3821,10 +4249,77 @@ static int onmatch_create(struct hist_trigger_data *hist_data,
goto out;
}
+static int action_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ struct trace_event_file *file = hist_data->event_file;
+ struct trace_array *tr = file->tr;
+ struct track_data *track_data;
+ struct field_var *field_var;
+ unsigned int i;
+ char *param;
+ int ret = 0;
+
+ if (data->action == ACTION_TRACE)
+ return trace_action_create(hist_data, data);
+
+ if (data->action == ACTION_SNAPSHOT) {
+ track_data = track_data_alloc(hist_data->key_size, data, hist_data);
+ if (IS_ERR(track_data)) {
+ ret = PTR_ERR(track_data);
+ goto out;
+ }
+
+ ret = tracing_snapshot_cond_enable(file->tr, track_data,
+ cond_snapshot_update);
+ if (ret)
+ track_data_free(track_data);
+
+ goto out;
+ }
+
+ if (data->action == ACTION_SAVE) {
+ if (hist_data->n_save_vars) {
+ ret = -EEXIST;
+ hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0);
+ goto out;
+ }
+
+ for (i = 0; i < data->n_params; i++) {
+ param = kstrdup(data->params[i], GFP_KERNEL);
+ if (!param) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ field_var = create_target_field_var(hist_data, NULL, NULL, param);
+ if (IS_ERR(field_var)) {
+ hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL,
+ errpos(param));
+ ret = PTR_ERR(field_var);
+ kfree(param);
+ goto out;
+ }
+
+ hist_data->save_vars[hist_data->n_save_vars++] = field_var;
+ if (field_var->val->flags & HIST_FIELD_FL_STRING)
+ hist_data->n_save_var_str++;
+ kfree(param);
+ }
+ }
+ out:
+ return ret;
+}
+
+static int onmatch_create(struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ return action_create(hist_data, data);
+}
+
static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
{
char *match_event, *match_event_system;
- char *synth_event_name, *params;
struct action_data *data;
int ret = -EINVAL;
@@ -3834,59 +4329,34 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
match_event = strsep(&str, ")");
if (!match_event || !str) {
- hist_err("onmatch: Missing closing paren: ", match_event);
+ hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event));
goto free;
}
match_event_system = strsep(&match_event, ".");
if (!match_event) {
- hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
+ hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system));
goto free;
}
if (IS_ERR(event_file(tr, match_event_system, match_event))) {
- hist_err_event("onmatch: Invalid subsystem or event name: ",
- match_event_system, match_event, NULL);
+ hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event));
goto free;
}
- data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
- if (!data->onmatch.match_event) {
+ data->match_data.event = kstrdup(match_event, GFP_KERNEL);
+ if (!data->match_data.event) {
ret = -ENOMEM;
goto free;
}
- data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
- if (!data->onmatch.match_event_system) {
- ret = -ENOMEM;
- goto free;
- }
-
- strsep(&str, ".");
- if (!str) {
- hist_err("onmatch: Missing . after onmatch(): ", str);
- goto free;
- }
-
- synth_event_name = strsep(&str, "(");
- if (!synth_event_name || !str) {
- hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
- goto free;
- }
-
- data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
- if (!data->onmatch.synth_event_name) {
+ data->match_data.event_system = kstrdup(match_event_system, GFP_KERNEL);
+ if (!data->match_data.event_system) {
ret = -ENOMEM;
goto free;
}
- params = strsep(&str, ")");
- if (!params || !str || (str && strlen(str))) {
- hist_err("onmatch: Missing closing paramlist paren: ", params);
- goto free;
- }
-
- ret = parse_action_params(params, data);
+ ret = action_parse(tr, str, data, HANDLER_ONMATCH);
if (ret)
goto free;
out:
@@ -3955,13 +4425,14 @@ static int create_var_field(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *var_name, char *expr_str)
{
+ struct trace_array *tr = hist_data->event_file->tr;
unsigned long flags = 0;
if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
return -EINVAL;
if (find_var(hist_data, file, var_name) && !hist_data->remove) {
- hist_err("Variable already defined: ", var_name);
+ hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name));
return -EINVAL;
}
@@ -4018,8 +4489,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
struct trace_event_file *file,
char *field_str)
{
+ struct trace_array *tr = hist_data->event_file->tr;
struct hist_field *hist_field = NULL;
-
unsigned long flags = 0;
unsigned int key_size;
int ret = 0;
@@ -4041,8 +4512,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
goto out;
}
- if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
- hist_err("Using variable references as keys not supported: ", field_str);
+ if (field_has_hist_vars(hist_field, 0)) {
+ hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str));
destroy_hist_field(hist_field, 0);
ret = -EINVAL;
goto out;
@@ -4143,6 +4614,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data)
static int parse_var_defs(struct hist_trigger_data *hist_data)
{
+ struct trace_array *tr = hist_data->event_file->tr;
char *s, *str, *var_name, *field_str;
unsigned int i, j, n_vars = 0;
int ret = 0;
@@ -4156,13 +4628,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)
var_name = strsep(&field_str, "=");
if (!var_name || !field_str) {
- hist_err("Malformed assignment: ", var_name);
+ hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT,
+ errpos(var_name));
ret = -EINVAL;
goto free;
}
if (n_vars == TRACING_MAP_VARS_MAX) {
- hist_err("Too many variables defined: ", var_name);
+ hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name));
ret = -EINVAL;
goto free;
}
@@ -4326,10 +4799,11 @@ static void destroy_actions(struct hist_trigger_data *hist_data)
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
- if (data->fn == action_trace)
+ if (data->handler == HANDLER_ONMATCH)
onmatch_destroy(data);
- else if (data->fn == onmax_save)
- onmax_destroy(data);
+ else if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE)
+ track_data_destroy(hist_data, data);
else
kfree(data);
}
@@ -4355,16 +4829,24 @@ static int parse_actions(struct hist_trigger_data *hist_data)
ret = PTR_ERR(data);
break;
}
- data->fn = action_trace;
} else if ((len = str_has_prefix(str, "onmax("))) {
char *action_str = str + len;
- data = onmax_parse(action_str);
+ data = track_data_parse(hist_data, action_str,
+ HANDLER_ONMAX);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+ } else if ((len = str_has_prefix(str, "onchange("))) {
+ char *action_str = str + len;
+
+ data = track_data_parse(hist_data, action_str,
+ HANDLER_ONCHANGE);
if (IS_ERR(data)) {
ret = PTR_ERR(data);
break;
}
- data->fn = onmax_save;
} else {
ret = -EINVAL;
break;
@@ -4376,8 +4858,7 @@ static int parse_actions(struct hist_trigger_data *hist_data)
return ret;
}
-static int create_actions(struct hist_trigger_data *hist_data,
- struct trace_event_file *file)
+static int create_actions(struct hist_trigger_data *hist_data)
{
struct action_data *data;
unsigned int i;
@@ -4386,14 +4867,18 @@ static int create_actions(struct hist_trigger_data *hist_data,
for (i = 0; i < hist_data->attrs->n_actions; i++) {
data = hist_data->actions[i];
- if (data->fn == action_trace) {
- ret = onmatch_create(hist_data, file, data);
+ if (data->handler == HANDLER_ONMATCH) {
+ ret = onmatch_create(hist_data, data);
if (ret)
- return ret;
- } else if (data->fn == onmax_save) {
- ret = onmax_create(hist_data, data);
+ break;
+ } else if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE) {
+ ret = track_data_create(hist_data, data);
if (ret)
- return ret;
+ break;
+ } else {
+ ret = -EINVAL;
+ break;
}
}
@@ -4409,26 +4894,51 @@ static void print_actions(struct seq_file *m,
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
- if (data->fn == onmax_save)
- onmax_print(m, hist_data, elt, data);
+ if (data->action == ACTION_SNAPSHOT)
+ continue;
+
+ if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE)
+ track_data_print(m, hist_data, elt, data);
}
}
-static void print_onmax_spec(struct seq_file *m,
- struct hist_trigger_data *hist_data,
- struct action_data *data)
+static void print_action_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct action_data *data)
{
unsigned int i;
- seq_puts(m, ":onmax(");
- seq_printf(m, "%s", data->onmax.var_str);
- seq_printf(m, ").%s(", data->onmax.fn_name);
-
- for (i = 0; i < hist_data->n_max_vars; i++) {
- seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
- if (i < hist_data->n_max_vars - 1)
- seq_puts(m, ",");
+ if (data->action == ACTION_SAVE) {
+ for (i = 0; i < hist_data->n_save_vars; i++) {
+ seq_printf(m, "%s", hist_data->save_vars[i]->var->var.name);
+ if (i < hist_data->n_save_vars - 1)
+ seq_puts(m, ",");
+ }
+ } else if (data->action == ACTION_TRACE) {
+ if (data->use_trace_keyword)
+ seq_printf(m, "%s", data->synth_event_name);
+ for (i = 0; i < data->n_params; i++) {
+ if (i || data->use_trace_keyword)
+ seq_puts(m, ",");
+ seq_printf(m, "%s", data->params[i]);
+ }
}
+}
+
+static void print_track_data_spec(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ struct action_data *data)
+{
+ if (data->handler == HANDLER_ONMAX)
+ seq_puts(m, ":onmax(");
+ else if (data->handler == HANDLER_ONCHANGE)
+ seq_puts(m, ":onchange(");
+ seq_printf(m, "%s", data->track_data.var_str);
+ seq_printf(m, ").%s(", data->action_name);
+
+ print_action_spec(m, hist_data, data);
+
seq_puts(m, ")");
}
@@ -4436,18 +4946,12 @@ static void print_onmatch_spec(struct seq_file *m,
struct hist_trigger_data *hist_data,
struct action_data *data)
{
- unsigned int i;
-
- seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
- data->onmatch.match_event);
+ seq_printf(m, ":onmatch(%s.%s).", data->match_data.event_system,
+ data->match_data.event);
- seq_printf(m, "%s(", data->onmatch.synth_event->name);
+ seq_printf(m, "%s(", data->action_name);
- for (i = 0; i < data->n_params; i++) {
- if (i)
- seq_puts(m, ",");
- seq_printf(m, "%s", data->params[i]);
- }
+ print_action_spec(m, hist_data, data);
seq_puts(m, ")");
}
@@ -4463,8 +4967,11 @@ static bool actions_match(struct hist_trigger_data *hist_data,
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
struct action_data *data_test = hist_data_test->actions[i];
+ char *action_name, *action_name_test;
- if (data->fn != data_test->fn)
+ if (data->handler != data_test->handler)
+ return false;
+ if (data->action != data_test->action)
return false;
if (data->n_params != data_test->n_params)
@@ -4475,22 +4982,30 @@ static bool actions_match(struct hist_trigger_data *hist_data,
return false;
}
- if (data->fn == action_trace) {
- if (strcmp(data->onmatch.synth_event_name,
- data_test->onmatch.synth_event_name) != 0)
- return false;
- if (strcmp(data->onmatch.match_event_system,
- data_test->onmatch.match_event_system) != 0)
- return false;
- if (strcmp(data->onmatch.match_event,
- data_test->onmatch.match_event) != 0)
+ if (data->use_trace_keyword)
+ action_name = data->synth_event_name;
+ else
+ action_name = data->action_name;
+
+ if (data_test->use_trace_keyword)
+ action_name_test = data_test->synth_event_name;
+ else
+ action_name_test = data_test->action_name;
+
+ if (strcmp(action_name, action_name_test) != 0)
+ return false;
+
+ if (data->handler == HANDLER_ONMATCH) {
+ if (strcmp(data->match_data.event_system,
+ data_test->match_data.event_system) != 0)
return false;
- } else if (data->fn == onmax_save) {
- if (strcmp(data->onmax.var_str,
- data_test->onmax.var_str) != 0)
+ if (strcmp(data->match_data.event,
+ data_test->match_data.event) != 0)
return false;
- if (strcmp(data->onmax.fn_name,
- data_test->onmax.fn_name) != 0)
+ } else if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE) {
+ if (strcmp(data->track_data.var_str,
+ data_test->track_data.var_str) != 0)
return false;
}
}
@@ -4507,10 +5022,11 @@ static void print_actions_spec(struct seq_file *m,
for (i = 0; i < hist_data->n_actions; i++) {
struct action_data *data = hist_data->actions[i];
- if (data->fn == action_trace)
+ if (data->handler == HANDLER_ONMATCH)
print_onmatch_spec(m, hist_data, data);
- else if (data->fn == onmax_save)
- print_onmax_spec(m, hist_data, data);
+ else if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE)
+ print_track_data_spec(m, hist_data, data);
}
}
@@ -4695,22 +5211,24 @@ static inline void add_to_key(char *compound_key, void *key,
/* ensure NULL-termination */
if (size > key_field->size - 1)
size = key_field->size - 1;
- }
- memcpy(compound_key + key_field->offset, key, size);
+ strncpy(compound_key + key_field->offset, (char *)key, size);
+ } else
+ memcpy(compound_key + key_field->offset, key, size);
}
static void
hist_trigger_actions(struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt, void *rec,
- struct ring_buffer_event *rbe, u64 *var_ref_vals)
+ struct ring_buffer_event *rbe, void *key,
+ u64 *var_ref_vals)
{
struct action_data *data;
unsigned int i;
for (i = 0; i < hist_data->n_actions; i++) {
data = hist_data->actions[i];
- data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
+ data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals);
}
}
@@ -4723,7 +5241,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
u64 var_ref_vals[TRACING_MAP_VARS_MAX];
char compound_key[HIST_KEY_SIZE_MAX];
struct tracing_map_elt *elt = NULL;
- struct stack_trace stacktrace;
struct hist_field *key_field;
u64 field_contents;
void *key = NULL;
@@ -4735,14 +5252,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
key_field = hist_data->fields[i];
if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
- stacktrace.max_entries = HIST_STACKTRACE_DEPTH;
- stacktrace.entries = entries;
- stacktrace.nr_entries = 0;
- stacktrace.skip = HIST_STACKTRACE_SKIP;
-
- memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE);
- save_stack_trace(&stacktrace);
-
+ memset(entries, 0, HIST_STACKTRACE_SIZE);
+ stack_trace_save(entries, HIST_STACKTRACE_DEPTH,
+ HIST_STACKTRACE_SKIP);
key = entries;
} else {
field_contents = key_field->fn(key_field, elt, rbe, rec);
@@ -4771,7 +5283,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,
hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
if (resolve_var_refs(hist_data, key, var_ref_vals, true))
- hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
+ hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals);
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -4783,7 +5295,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
unsigned int i;
for (i = 0; i < max_entries; i++) {
- if (stacktrace_entries[i] == ULONG_MAX)
+ if (!stacktrace_entries[i])
return;
seq_printf(m, "%*c", 1 + spaces, ' ');
@@ -4792,10 +5304,10 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,
}
}
-static void
-hist_trigger_entry_print(struct seq_file *m,
- struct hist_trigger_data *hist_data, void *key,
- struct tracing_map_elt *elt)
+static void hist_trigger_print_key(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ void *key,
+ struct tracing_map_elt *elt)
{
struct hist_field *key_field;
char str[KSYM_SYMBOL_LEN];
@@ -4871,6 +5383,17 @@ hist_trigger_entry_print(struct seq_file *m,
seq_puts(m, " ");
seq_puts(m, "}");
+}
+
+static void hist_trigger_entry_print(struct seq_file *m,
+ struct hist_trigger_data *hist_data,
+ void *key,
+ struct tracing_map_elt *elt)
+{
+ const char *field_name;
+ unsigned int i;
+
+ hist_trigger_print_key(m, hist_data, key, elt);
seq_printf(m, " hitcount: %10llu",
tracing_map_read_sum(elt, HITCOUNT_IDX));
@@ -4937,6 +5460,8 @@ static void hist_trigger_show(struct seq_file *m,
if (n_entries < 0)
n_entries = 0;
+ track_data_snapshot_print(m, hist_data);
+
seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n",
(u64)atomic64_read(&hist_data->map->hits),
n_entries, (u64)atomic64_read(&hist_data->map->drops));
@@ -4961,11 +5486,6 @@ static int hist_show(struct seq_file *m, void *v)
hist_trigger_show(m, data, n++);
}
- if (have_hist_err()) {
- seq_printf(m, "\nERROR: %s\n", hist_err_str);
- seq_printf(m, " Last command: %s\n", last_hist_cmd);
- }
-
out_unlock:
mutex_unlock(&event_mutex);
@@ -5330,6 +5850,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
{
struct hist_trigger_data *hist_data = data->private_data;
struct event_trigger_data *test, *named_data = NULL;
+ struct trace_array *tr = file->tr;
int ret = 0;
if (hist_data->attrs->name) {
@@ -5337,7 +5858,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
if (named_data) {
if (!hist_trigger_match(data, named_data, named_data,
true)) {
- hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
+ hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name));
ret = -EINVAL;
goto out;
}
@@ -5358,7 +5879,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
else if (hist_data->attrs->clear)
hist_clear(test);
else {
- hist_err("Hist trigger already exists", NULL);
+ hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0);
ret = -EEXIST;
}
goto out;
@@ -5366,7 +5887,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
}
new:
if (hist_data->attrs->cont || hist_data->attrs->clear) {
- hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
+ hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0);
ret = -ENOENT;
goto out;
}
@@ -5391,7 +5912,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
if (ret) {
- hist_err("Couldn't set trace_clock: ", clock);
+ hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock));
goto out;
}
@@ -5567,8 +6088,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
lockdep_assert_held(&event_mutex);
if (glob && strlen(glob)) {
- last_cmd_set(param);
hist_err_clear();
+ last_cmd_set(file, param);
}
if (!param)
@@ -5609,7 +6130,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
trigger = strstrip(trigger);
}
- attrs = parse_hist_trigger_attrs(trigger);
+ attrs = parse_hist_trigger_attrs(file->tr, trigger);
if (IS_ERR(attrs))
return PTR_ERR(attrs);
@@ -5683,7 +6204,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
if (has_hist_vars(hist_data))
save_hist_vars(hist_data);
- ret = create_actions(hist_data, file);
+ ret = create_actions(hist_data);
if (ret)
goto out_unreg;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index cd12ecb66eb9..2a2912cb4533 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str,
goto out;
/* The filter is for the 'trigger' event, not the triggered event */
- ret = create_event_filter(file->event_call, filter_str, false, &filter);
+ ret = create_event_filter(file->tr, file->event_call,
+ filter_str, false, &filter);
/*
* If create_event_filter() fails, filter still needs to be freed.
* Which the calling code will do with data->filter.
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c2af1560e856..69ebf3c2f1b5 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -380,6 +380,7 @@ static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
trace_seq_putc(s, ' ');
trace_print_lat_fmt(s, entry);
+ trace_seq_puts(s, " | ");
}
/* If the pid changed since the last trace, output this event */
@@ -501,6 +502,17 @@ static void print_graph_abs_time(u64 t, struct trace_seq *s)
}
static void
+print_graph_rel_time(struct trace_iterator *iter, struct trace_seq *s)
+{
+ unsigned long long usecs;
+
+ usecs = iter->ts - iter->trace_buffer->time_start;
+ do_div(usecs, NSEC_PER_USEC);
+
+ trace_seq_printf(s, "%9llu us | ", usecs);
+}
+
+static void
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid, u32 flags)
{
@@ -517,6 +529,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
print_graph_abs_time(iter->ts, s);
+ /* Relative time */
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ print_graph_rel_time(iter, s);
+
/* Cpu */
if (flags & TRACE_GRAPH_PRINT_CPU)
print_graph_cpu(s, cpu);
@@ -725,6 +741,10 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
print_graph_abs_time(iter->ts, s);
+ /* Relative time */
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ print_graph_rel_time(iter, s);
+
/* Cpu */
if (flags & TRACE_GRAPH_PRINT_CPU)
print_graph_cpu(s, cpu);
@@ -1101,6 +1121,8 @@ static void print_lat_header(struct seq_file *s, u32 flags)
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
size += 16;
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ size += 16;
if (flags & TRACE_GRAPH_PRINT_CPU)
size += 4;
if (flags & TRACE_GRAPH_PRINT_PROC)
@@ -1125,12 +1147,14 @@ static void __print_graph_headers_flags(struct trace_array *tr,
seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
seq_puts(s, " TIME ");
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ seq_puts(s, " REL TIME ");
if (flags & TRACE_GRAPH_PRINT_CPU)
seq_puts(s, " CPU");
if (flags & TRACE_GRAPH_PRINT_PROC)
seq_puts(s, " TASK/PID ");
if (lat)
- seq_puts(s, "||||");
+ seq_puts(s, "|||| ");
if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_puts(s, " DURATION ");
seq_puts(s, " FUNCTION CALLS\n");
@@ -1139,12 +1163,14 @@ static void __print_graph_headers_flags(struct trace_array *tr,
seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
seq_puts(s, " | ");
+ if (flags & TRACE_GRAPH_PRINT_REL_TIME)
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_CPU)
seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_PROC)
seq_puts(s, " | | ");
if (lat)
- seq_puts(s, "||||");
+ seq_puts(s, "|||| ");
if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_puts(s, " | | ");
seq_puts(s, " | | | |\n");
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 1e6db9cbe4dc..fa95139445b2 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
* of this thread, than stop migrating for the duration
* of the current test.
*/
- if (!cpumask_equal(current_mask, &current->cpus_allowed))
+ if (!cpumask_equal(current_mask, current->cpus_ptr))
goto disable;
get_online_cpus();
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index d3294721f119..a745b0cee5d3 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -14,6 +14,7 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include "trace.h"
@@ -238,7 +239,7 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
TRACE_GRAPH_PRINT_PROC | \
- TRACE_GRAPH_PRINT_ABS_TIME | \
+ TRACE_GRAPH_PRINT_REL_TIME | \
TRACE_GRAPH_PRINT_DURATION)
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
@@ -365,7 +366,7 @@ out:
__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
}
-static inline void
+static nokprobe_inline void
start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
{
int cpu;
@@ -401,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
atomic_dec(&data->disabled);
}
-static inline void
+static nokprobe_inline void
stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc)
{
int cpu;
@@ -443,6 +444,7 @@ void start_critical_timings(void)
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
}
EXPORT_SYMBOL_GPL(start_critical_timings);
+NOKPROBE_SYMBOL(start_critical_timings);
void stop_critical_timings(void)
{
@@ -452,6 +454,7 @@ void stop_critical_timings(void)
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc);
}
EXPORT_SYMBOL_GPL(stop_critical_timings);
+NOKPROBE_SYMBOL(stop_critical_timings);
#ifdef CONFIG_FUNCTION_TRACER
static bool function_enabled;
@@ -611,6 +614,7 @@ void tracer_hardirqs_on(unsigned long a0, unsigned long a1)
if (!preempt_trace(pc) && irq_trace())
stop_critical_timing(a0, a1, pc);
}
+NOKPROBE_SYMBOL(tracer_hardirqs_on);
void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
{
@@ -619,6 +623,7 @@ void tracer_hardirqs_off(unsigned long a0, unsigned long a1)
if (!preempt_trace(pc) && irq_trace())
start_critical_timing(a0, a1, pc);
}
+NOKPROBE_SYMBOL(tracer_hardirqs_off);
static int irqsoff_tracer_init(struct trace_array *tr)
{
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index d953c163a079..cca65044c14c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -17,48 +17,42 @@
#include "trace.h"
#include "trace_output.h"
-static void ftrace_dump_buf(int skip_lines, long cpu_file)
+static struct trace_iterator iter;
+static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
+
+static void ftrace_dump_buf(int skip_entries, long cpu_file)
{
- /* use static because iter can be a bit big for the stack */
- static struct trace_iterator iter;
- static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
struct trace_array *tr;
unsigned int old_userobj;
int cnt = 0, cpu;
- trace_init_global_iter(&iter);
- iter.buffer_iter = buffer_iter;
tr = iter.tr;
- for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
- }
-
old_userobj = tr->trace_flags;
/* don't look at user memory in panic mode */
tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
kdb_printf("Dumping ftrace buffer:\n");
+ if (skip_entries)
+ kdb_printf("(skipping %d entries)\n", skip_entries);
- /* reset all but tr, trace, and overruns */
- memset(&iter.seq, 0,
- sizeof(struct trace_iterator) -
- offsetof(struct trace_iterator, seq));
+ trace_iterator_reset(&iter);
iter.iter_flags |= TRACE_FILE_LAT_FMT;
- iter.pos = -1;
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer,
+ cpu, GFP_ATOMIC);
ring_buffer_read_start(iter.buffer_iter[cpu]);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer,
+ cpu_file, GFP_ATOMIC);
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
@@ -68,11 +62,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
kdb_printf("---------------------------------\n");
cnt++;
- if (!skip_lines) {
+ if (!skip_entries) {
print_trace_line(&iter);
trace_printk_seq(&iter.seq);
} else {
- skip_lines--;
+ skip_entries--;
}
if (KDB_FLAG(CMD_INTERRUPT))
@@ -88,10 +82,6 @@ out:
tr->trace_flags = old_userobj;
for_each_tracing_cpu(cpu) {
- atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
- }
-
- for_each_tracing_cpu(cpu) {
if (iter.buffer_iter[cpu]) {
ring_buffer_read_finish(iter.buffer_iter[cpu]);
iter.buffer_iter[cpu] = NULL;
@@ -104,17 +94,19 @@ out:
*/
static int kdb_ftdump(int argc, const char **argv)
{
- int skip_lines = 0;
+ int skip_entries = 0;
long cpu_file;
char *cp;
+ int cnt;
+ int cpu;
if (argc > 2)
return KDB_ARGCOUNT;
if (argc) {
- skip_lines = simple_strtol(argv[1], &cp, 0);
+ skip_entries = simple_strtol(argv[1], &cp, 0);
if (*cp)
- skip_lines = 0;
+ skip_entries = 0;
}
if (argc == 2) {
@@ -127,7 +119,29 @@ static int kdb_ftdump(int argc, const char **argv)
}
kdb_trap_printk++;
- ftrace_dump_buf(skip_lines, cpu_file);
+
+ trace_init_global_iter(&iter);
+ iter.buffer_iter = buffer_iter;
+
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ }
+
+ /* A negative skip_entries means skip all but the last entries */
+ if (skip_entries < 0) {
+ if (cpu_file == RING_BUFFER_ALL_CPUS)
+ cnt = trace_total_entries(NULL);
+ else
+ cnt = trace_total_entries_cpu(NULL, cpu_file);
+ skip_entries = max(cnt + skip_entries, 0);
+ }
+
+ ftrace_dump_buf(skip_entries, cpu_file);
+
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
+ }
+
kdb_trap_printk--;
return 0;
@@ -135,8 +149,9 @@ static int kdb_ftdump(int argc, const char **argv)
static __init int kdb_ftrace_register(void)
{
- kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]",
- "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE);
+ kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]",
+ "Dump ftrace log; -skip dumps last #entries", 0,
+ KDB_ENABLE_ALWAYS_SAFE);
return 0;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d5fb09ebba8b..7d736248a070 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,7 +35,7 @@ static struct dyn_event_operations trace_kprobe_ops = {
.match = trace_kprobe_match,
};
-/**
+/*
* Kprobe event core functions
*/
struct trace_kprobe {
@@ -221,7 +221,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
tk->rp.maxactive = maxactive;
- if (!event || !is_good_name(event)) {
+ if (!event || !group) {
ret = -EINVAL;
goto error;
}
@@ -231,11 +231,6 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
if (!tk->tp.call.name)
goto error;
- if (!group || !is_good_name(group)) {
- ret = -EINVAL;
- goto error;
- }
-
tk->tp.class.system = kstrdup(group, GFP_KERNEL);
if (!tk->tp.class.system)
goto error;
@@ -446,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
else
ret = register_kprobe(&tk->rp.kp);
- if (ret == 0) {
+ if (ret == 0)
tk->tp.flags |= TP_FLAG_REGISTERED;
- } else if (ret == -EILSEQ) {
- pr_warn("Probing address(0x%p) is not an instruction boundary.\n",
- tk->rp.kp.addr);
- ret = -EINVAL;
- }
return ret;
}
@@ -596,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[])
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_kprobe *tk;
+ struct trace_kprobe *tk = NULL;
int i, len, ret = 0;
bool is_return = false;
char *symbol = NULL, *tmp = NULL;
@@ -620,40 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[])
if (argc < 2)
return -ECANCELED;
+ trace_probe_log_init("trace_kprobe", argc, argv);
+
event = strchr(&argv[0][1], ':');
if (event)
event++;
- if (is_return && isdigit(argv[0][1])) {
+ if (isdigit(argv[0][1])) {
+ if (!is_return) {
+ trace_probe_log_err(1, MAXACT_NO_KPROBE);
+ goto parse_error;
+ }
if (event)
len = event - &argv[0][1] - 1;
else
len = strlen(&argv[0][1]);
- if (len > MAX_EVENT_NAME_LEN - 1)
- return -E2BIG;
+ if (len > MAX_EVENT_NAME_LEN - 1) {
+ trace_probe_log_err(1, BAD_MAXACT);
+ goto parse_error;
+ }
memcpy(buf, &argv[0][1], len);
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
- if (ret) {
- pr_info("Failed to parse maxactive.\n");
- return ret;
+ if (ret || !maxactive) {
+ trace_probe_log_err(1, BAD_MAXACT);
+ goto parse_error;
}
/* kretprobes instances are iterated over via a list. The
* maximum should stay reasonable.
*/
if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
- pr_info("Maxactive is too big (%d > %d).\n",
- maxactive, KRETPROBE_MAXACTIVE_MAX);
- return -E2BIG;
+ trace_probe_log_err(1, MAXACT_TOO_BIG);
+ goto parse_error;
}
}
/* try to parse an address. if that fails, try to read the
* input as a symbol. */
if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
+ trace_probe_log_set_index(1);
/* Check whether uprobe event specified */
- if (strchr(argv[1], '/') && strchr(argv[1], ':'))
- return -ECANCELED;
+ if (strchr(argv[1], '/') && strchr(argv[1], ':')) {
+ ret = -ECANCELED;
+ goto error;
+ }
/* a symbol specified */
symbol = kstrdup(argv[1], GFP_KERNEL);
if (!symbol)
@@ -661,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[])
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret || offset < 0 || offset > UINT_MAX) {
- pr_info("Failed to parse either an address or a symbol.\n");
- goto out;
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ goto parse_error;
}
if (kprobe_on_func_entry(NULL, symbol, offset))
flags |= TPARG_FL_FENTRY;
if (offset && is_return && !(flags & TPARG_FL_FENTRY)) {
- pr_info("Given offset is not valid for return probe.\n");
- ret = -EINVAL;
- goto out;
+ trace_probe_log_err(0, BAD_RETPROBE);
+ goto parse_error;
}
}
- argc -= 2; argv += 2;
+ trace_probe_log_set_index(0);
if (event) {
- ret = traceprobe_parse_event_name(&event, &group, buf);
+ ret = traceprobe_parse_event_name(&event, &group, buf,
+ event - argv[0]);
if (ret)
- goto out;
+ goto parse_error;
} else {
/* Make a new event name */
if (symbol)
@@ -692,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[])
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
- argc, is_return);
+ argc - 2, is_return);
if (IS_ERR(tk)) {
- pr_info("Failed to allocate trace_probe.(%d)\n",
- (int)PTR_ERR(tk));
ret = PTR_ERR(tk);
- goto out;
+ /* This must return -ENOMEM, else there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM);
+ goto out; /* We know tk is not allocated */
}
+ argc -= 2; argv += 2;
/* parse arguments */
for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
@@ -708,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[])
goto error;
}
+ trace_probe_log_set_index(i + 2);
ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags);
kfree(tmp);
if (ret)
- goto error;
+ goto error; /* This can be -ENOMEM */
}
ret = register_trace_kprobe(tk);
- if (ret)
+ if (ret) {
+ trace_probe_log_set_index(1);
+ if (ret == -EILSEQ)
+ trace_probe_log_err(0, BAD_INSN_BNDRY);
+ else if (ret == -ENOENT)
+ trace_probe_log_err(0, BAD_PROBE_ADDR);
+ else if (ret != -ENOMEM)
+ trace_probe_log_err(0, FAIL_REG_PROBE);
goto error;
+ }
+
out:
+ trace_probe_log_clear();
kfree(symbol);
return ret;
+parse_error:
+ ret = -EINVAL;
error:
free_trace_kprobe(tk);
goto out;
@@ -861,22 +875,14 @@ static const struct file_operations kprobe_profile_ops = {
static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
- mm_segment_t old_fs;
int ret, len = 0;
u8 c;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- pagefault_disable();
-
do {
- ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
+ ret = probe_kernel_read(&c, (u8 *)addr + len, 1);
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
- pagefault_enable();
- set_fs(old_fs);
-
return (ret < 0) ? ret : len;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 54373d93e251..ba751f993c3b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_seq_puts(s, "<stack trace>\n");
- for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
+ for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) {
if (trace_seq_has_overflowed(s))
break;
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index 71f553cceb3c..4d8e99fdbbbe 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -9,6 +9,7 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include "trace.h"
#define CREATE_TRACE_POINTS
@@ -30,6 +31,7 @@ void trace_hardirqs_on(void)
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
+NOKPROBE_SYMBOL(trace_hardirqs_on);
void trace_hardirqs_off(void)
{
@@ -43,6 +45,7 @@ void trace_hardirqs_off(void)
lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off);
+NOKPROBE_SYMBOL(trace_hardirqs_off);
__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
{
@@ -56,6 +59,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);
+NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
@@ -69,6 +73,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
+NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_TRACE_IRQFLAGS */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 9962cb5da8ac..a347faced959 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -13,7 +13,12 @@
#include "trace_probe.h"
-const char *reserved_field_names[] = {
+#undef C
+#define C(a, b) b
+
+static const char *trace_probe_err_text[] = { ERRORS };
+
+static const char *reserved_field_names[] = {
"common_type",
"common_flags",
"common_preempt_count",
@@ -133,6 +138,60 @@ fail:
return NULL;
}
+static struct trace_probe_log trace_probe_log;
+
+void trace_probe_log_init(const char *subsystem, int argc, const char **argv)
+{
+ trace_probe_log.subsystem = subsystem;
+ trace_probe_log.argc = argc;
+ trace_probe_log.argv = argv;
+ trace_probe_log.index = 0;
+}
+
+void trace_probe_log_clear(void)
+{
+ memset(&trace_probe_log, 0, sizeof(trace_probe_log));
+}
+
+void trace_probe_log_set_index(int index)
+{
+ trace_probe_log.index = index;
+}
+
+void __trace_probe_log_err(int offset, int err_type)
+{
+ char *command, *p;
+ int i, len = 0, pos = 0;
+
+ if (!trace_probe_log.argv)
+ return;
+
+ /* Recalcurate the length and allocate buffer */
+ for (i = 0; i < trace_probe_log.argc; i++) {
+ if (i == trace_probe_log.index)
+ pos = len;
+ len += strlen(trace_probe_log.argv[i]) + 1;
+ }
+ command = kzalloc(len, GFP_KERNEL);
+ if (!command)
+ return;
+
+ /* And make a command string from argv array */
+ p = command;
+ for (i = 0; i < trace_probe_log.argc; i++) {
+ len = strlen(trace_probe_log.argv[i]);
+ strcpy(p, trace_probe_log.argv[i]);
+ p[len] = ' ';
+ p += len + 1;
+ }
+ *(p - 1) = '\0';
+
+ tracing_log_err(NULL, trace_probe_log.subsystem, command,
+ trace_probe_err_text, err_type, pos + offset);
+
+ kfree(command);
+}
+
/* Split symbol and offset. */
int traceprobe_split_symbol_offset(char *symbol, long *offset)
{
@@ -156,26 +215,41 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)
/* @buf must has MAX_EVENT_NAME_LEN size */
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
- char *buf)
+ char *buf, int offset)
{
const char *slash, *event = *pevent;
+ int len;
slash = strchr(event, '/');
if (slash) {
if (slash == event) {
- pr_info("Group name is not specified\n");
+ trace_probe_log_err(offset, NO_GROUP_NAME);
return -EINVAL;
}
if (slash - event + 1 > MAX_EVENT_NAME_LEN) {
- pr_info("Group name is too long\n");
- return -E2BIG;
+ trace_probe_log_err(offset, GROUP_TOO_LONG);
+ return -EINVAL;
}
strlcpy(buf, event, slash - event + 1);
+ if (!is_good_name(buf)) {
+ trace_probe_log_err(offset, BAD_GROUP_NAME);
+ return -EINVAL;
+ }
*pgroup = buf;
*pevent = slash + 1;
+ offset += slash - event + 1;
+ event = *pevent;
+ }
+ len = strlen(event);
+ if (len == 0) {
+ trace_probe_log_err(offset, NO_EVENT_NAME);
+ return -EINVAL;
+ } else if (len > MAX_EVENT_NAME_LEN) {
+ trace_probe_log_err(offset, EVENT_TOO_LONG);
+ return -EINVAL;
}
- if (strlen(event) == 0) {
- pr_info("Event name is not specified\n");
+ if (!is_good_name(event)) {
+ trace_probe_log_err(offset, BAD_EVENT_NAME);
return -EINVAL;
}
return 0;
@@ -184,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
static int parse_probe_vars(char *arg, const struct fetch_type *t,
- struct fetch_insn *code, unsigned int flags)
+ struct fetch_insn *code, unsigned int flags, int offs)
{
unsigned long param;
int ret = 0;
int len;
if (strcmp(arg, "retval") == 0) {
- if (flags & TPARG_FL_RETURN)
+ if (flags & TPARG_FL_RETURN) {
code->op = FETCH_OP_RETVAL;
- else
+ } else {
+ trace_probe_log_err(offs, RETVAL_ON_PROBE);
ret = -EINVAL;
+ }
} else if ((len = str_has_prefix(arg, "stack"))) {
if (arg[len] == '\0') {
code->op = FETCH_OP_STACKP;
} else if (isdigit(arg[len])) {
ret = kstrtoul(arg + len, 10, &param);
- if (ret || ((flags & TPARG_FL_KERNEL) &&
- param > PARAM_MAX_STACK))
+ if (ret) {
+ goto inval_var;
+ } else if ((flags & TPARG_FL_KERNEL) &&
+ param > PARAM_MAX_STACK) {
+ trace_probe_log_err(offs, BAD_STACK_NUM);
ret = -EINVAL;
- else {
+ } else {
code->op = FETCH_OP_STACK;
code->param = (unsigned int)param;
}
} else
- ret = -EINVAL;
+ goto inval_var;
} else if (strcmp(arg, "comm") == 0) {
code->op = FETCH_OP_COMM;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
} else if (((flags & TPARG_FL_MASK) ==
(TPARG_FL_KERNEL | TPARG_FL_FENTRY)) &&
(len = str_has_prefix(arg, "arg"))) {
- if (!isdigit(arg[len]))
- return -EINVAL;
ret = kstrtoul(arg + len, 10, &param);
- if (ret || !param || param > PARAM_MAX_STACK)
+ if (ret) {
+ goto inval_var;
+ } else if (!param || param > PARAM_MAX_STACK) {
+ trace_probe_log_err(offs, BAD_ARG_NUM);
return -EINVAL;
+ }
code->op = FETCH_OP_ARG;
code->param = (unsigned int)param - 1;
#endif
} else
- ret = -EINVAL;
+ goto inval_var;
return ret;
+
+inval_var:
+ trace_probe_log_err(offs, BAD_VAR);
+ return -EINVAL;
}
/* Recursive argument parser */
static int
parse_probe_arg(char *arg, const struct fetch_type *type,
struct fetch_insn **pcode, struct fetch_insn *end,
- unsigned int flags)
+ unsigned int flags, int offs)
{
struct fetch_insn *code = *pcode;
unsigned long param;
@@ -243,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
switch (arg[0]) {
case '$':
- ret = parse_probe_vars(arg + 1, type, code, flags);
+ ret = parse_probe_vars(arg + 1, type, code, flags, offs);
break;
case '%': /* named register */
@@ -252,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
code->op = FETCH_OP_REG;
code->param = (unsigned int)ret;
ret = 0;
- }
+ } else
+ trace_probe_log_err(offs, BAD_REG_NAME);
break;
case '@': /* memory, file-offset or symbol */
if (isdigit(arg[1])) {
ret = kstrtoul(arg + 1, 0, &param);
- if (ret)
+ if (ret) {
+ trace_probe_log_err(offs, BAD_MEM_ADDR);
break;
+ }
/* load address */
code->op = FETCH_OP_IMM;
code->immediate = param;
} else if (arg[1] == '+') {
/* kprobes don't support file offsets */
- if (flags & TPARG_FL_KERNEL)
+ if (flags & TPARG_FL_KERNEL) {
+ trace_probe_log_err(offs, FILE_ON_KPROBE);
return -EINVAL;
-
+ }
ret = kstrtol(arg + 2, 0, &offset);
- if (ret)
+ if (ret) {
+ trace_probe_log_err(offs, BAD_FILE_OFFS);
break;
+ }
code->op = FETCH_OP_FOFFS;
code->immediate = (unsigned long)offset; // imm64?
} else {
/* uprobes don't support symbols */
- if (!(flags & TPARG_FL_KERNEL))
+ if (!(flags & TPARG_FL_KERNEL)) {
+ trace_probe_log_err(offs, SYM_ON_UPROBE);
return -EINVAL;
-
+ }
/* Preserve symbol for updating */
code->op = FETCH_NOP_SYMBOL;
code->data = kstrdup(arg + 1, GFP_KERNEL);
if (!code->data)
return -ENOMEM;
- if (++code == end)
- return -E2BIG;
-
+ if (++code == end) {
+ trace_probe_log_err(offs, TOO_MANY_OPS);
+ return -EINVAL;
+ }
code->op = FETCH_OP_IMM;
code->immediate = 0;
}
/* These are fetching from memory */
- if (++code == end)
- return -E2BIG;
+ if (++code == end) {
+ trace_probe_log_err(offs, TOO_MANY_OPS);
+ return -EINVAL;
+ }
*pcode = code;
code->op = FETCH_OP_DEREF;
code->offset = offset;
@@ -300,30 +395,41 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
case '+': /* deref memory */
arg++; /* Skip '+', because kstrtol() rejects it. */
+ /* fall through */
case '-':
tmp = strchr(arg, '(');
- if (!tmp)
+ if (!tmp) {
+ trace_probe_log_err(offs, DEREF_NEED_BRACE);
return -EINVAL;
-
+ }
*tmp = '\0';
ret = kstrtol(arg, 0, &offset);
- if (ret)
+ if (ret) {
+ trace_probe_log_err(offs, BAD_DEREF_OFFS);
break;
-
+ }
+ offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
arg = tmp + 1;
tmp = strrchr(arg, ')');
-
- if (tmp) {
+ if (!tmp) {
+ trace_probe_log_err(offs + strlen(arg),
+ DEREF_OPEN_BRACE);
+ return -EINVAL;
+ } else {
const struct fetch_type *t2 = find_fetch_type(NULL);
*tmp = '\0';
- ret = parse_probe_arg(arg, t2, &code, end, flags);
+ ret = parse_probe_arg(arg, t2, &code, end, flags, offs);
if (ret)
break;
- if (code->op == FETCH_OP_COMM)
+ if (code->op == FETCH_OP_COMM) {
+ trace_probe_log_err(offs, COMM_CANT_DEREF);
+ return -EINVAL;
+ }
+ if (++code == end) {
+ trace_probe_log_err(offs, TOO_MANY_OPS);
return -EINVAL;
- if (++code == end)
- return -E2BIG;
+ }
*pcode = code;
code->op = FETCH_OP_DEREF;
@@ -333,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
}
if (!ret && code->op == FETCH_OP_NOP) {
/* Parsed, but do not find fetch method */
+ trace_probe_log_err(offs, BAD_FETCH_ARG);
ret = -EINVAL;
}
return ret;
@@ -364,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf,
return -EINVAL;
code++;
if (code->op != FETCH_OP_NOP)
- return -E2BIG;
+ return -EINVAL;
*pcode = code;
code->op = FETCH_OP_MOD_BF;
@@ -377,44 +484,66 @@ static int __parse_bitfield_probe_arg(const char *bf,
/* String length checking wrapper */
static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
- struct probe_arg *parg, unsigned int flags)
+ struct probe_arg *parg, unsigned int flags, int offset)
{
struct fetch_insn *code, *scode, *tmp = NULL;
- char *t, *t2;
+ char *t, *t2, *t3;
int ret, len;
- if (strlen(arg) > MAX_ARGSTR_LEN) {
- pr_info("Argument is too long.: %s\n", arg);
- return -ENOSPC;
+ len = strlen(arg);
+ if (len > MAX_ARGSTR_LEN) {
+ trace_probe_log_err(offset, ARG_TOO_LONG);
+ return -EINVAL;
+ } else if (len == 0) {
+ trace_probe_log_err(offset, NO_ARG_BODY);
+ return -EINVAL;
}
+
parg->comm = kstrdup(arg, GFP_KERNEL);
- if (!parg->comm) {
- pr_info("Failed to allocate memory for command '%s'.\n", arg);
+ if (!parg->comm)
return -ENOMEM;
- }
+
t = strchr(arg, ':');
if (t) {
*t = '\0';
t2 = strchr(++t, '[');
if (t2) {
- *t2 = '\0';
- parg->count = simple_strtoul(t2 + 1, &t2, 0);
- if (strcmp(t2, "]") || parg->count == 0)
+ *t2++ = '\0';
+ t3 = strchr(t2, ']');
+ if (!t3) {
+ offset += t2 + strlen(t2) - arg;
+ trace_probe_log_err(offset,
+ ARRAY_NO_CLOSE);
+ return -EINVAL;
+ } else if (t3[1] != '\0') {
+ trace_probe_log_err(offset + t3 + 1 - arg,
+ BAD_ARRAY_SUFFIX);
return -EINVAL;
- if (parg->count > MAX_ARRAY_LEN)
- return -E2BIG;
+ }
+ *t3 = '\0';
+ if (kstrtouint(t2, 0, &parg->count) || !parg->count) {
+ trace_probe_log_err(offset + t2 - arg,
+ BAD_ARRAY_NUM);
+ return -EINVAL;
+ }
+ if (parg->count > MAX_ARRAY_LEN) {
+ trace_probe_log_err(offset + t2 - arg,
+ ARRAY_TOO_BIG);
+ return -EINVAL;
+ }
}
}
- /*
- * The default type of $comm should be "string", and it can't be
- * dereferenced.
- */
- if (!t && strcmp(arg, "$comm") == 0)
+
+ /* Since $comm can not be dereferred, we can find $comm by strcmp */
+ if (strcmp(arg, "$comm") == 0) {
+ /* The type of $comm must be "string", and not an array. */
+ if (parg->count || (t && strcmp(t, "string")))
+ return -EINVAL;
parg->type = find_fetch_type("string");
- else
+ } else
parg->type = find_fetch_type(t);
if (!parg->type) {
- pr_info("Unsupported type: %s\n", t);
+ trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE);
return -EINVAL;
}
parg->offset = *size;
@@ -429,13 +558,13 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
parg->count);
}
- code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL);
+ code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
if (!code)
return -ENOMEM;
code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
- flags);
+ flags, offset);
if (ret)
goto fail;
@@ -443,7 +572,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
if (!strcmp(parg->type->name, "string")) {
if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM &&
code->op != FETCH_OP_COMM) {
- pr_info("string only accepts memory or address.\n");
+ trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ BAD_STRING);
ret = -EINVAL;
goto fail;
}
@@ -455,7 +585,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
*/
code++;
if (code->op != FETCH_OP_NOP) {
- ret = -E2BIG;
+ trace_probe_log_err(offset, TOO_MANY_OPS);
+ ret = -EINVAL;
goto fail;
}
}
@@ -468,7 +599,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
} else {
code++;
if (code->op != FETCH_OP_NOP) {
- ret = -E2BIG;
+ trace_probe_log_err(offset, TOO_MANY_OPS);
+ ret = -EINVAL;
goto fail;
}
code->op = FETCH_OP_ST_RAW;
@@ -478,20 +610,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
/* Modify operation */
if (t != NULL) {
ret = __parse_bitfield_probe_arg(t, parg->type, &code);
- if (ret)
+ if (ret) {
+ trace_probe_log_err(offset + t - arg, BAD_BITFIELD);
goto fail;
+ }
}
/* Loop(Array) operation */
if (parg->count) {
if (scode->op != FETCH_OP_ST_MEM &&
scode->op != FETCH_OP_ST_STRING) {
- pr_info("array only accepts memory or address\n");
+ trace_probe_log_err(offset + (t ? (t - arg) : 0),
+ BAD_STRING);
ret = -EINVAL;
goto fail;
}
code++;
if (code->op != FETCH_OP_NOP) {
- ret = -E2BIG;
+ trace_probe_log_err(offset, TOO_MANY_OPS);
+ ret = -EINVAL;
goto fail;
}
code->op = FETCH_OP_LP_ARRAY;
@@ -501,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
code->op = FETCH_OP_END;
/* Shrink down the code buffer */
- parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL);
+ parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL);
if (!parg->code)
ret = -ENOMEM;
else
@@ -540,13 +676,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
{
struct probe_arg *parg = &tp->args[i];
char *body;
- int ret;
/* Increment count for freeing args in error case */
tp->nr_args++;
body = strchr(arg, '=');
if (body) {
+ if (body - arg > MAX_ARG_NAME_LEN) {
+ trace_probe_log_err(0, ARG_NAME_TOO_LONG);
+ return -EINVAL;
+ } else if (body == arg) {
+ trace_probe_log_err(0, NO_ARG_NAME);
+ return -EINVAL;
+ }
parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL);
body++;
} else {
@@ -558,22 +700,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg,
return -ENOMEM;
if (!is_good_name(parg->name)) {
- pr_info("Invalid argument[%d] name: %s\n",
- i, parg->name);
+ trace_probe_log_err(0, BAD_ARG_NAME);
return -EINVAL;
}
-
if (traceprobe_conflict_field_name(parg->name, tp->args, i)) {
- pr_info("Argument[%d]: '%s' conflicts with another field.\n",
- i, parg->name);
+ trace_probe_log_err(0, USED_ARG_NAME);
return -EINVAL;
}
-
/* Parse fetch argument */
- ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags);
- if (ret)
- pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
- return ret;
+ return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags,
+ body - arg);
}
void traceprobe_free_probe_arg(struct probe_arg *arg)
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 8a63f8bc01bc..f9a8c632188b 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -32,6 +32,7 @@
#define MAX_TRACE_ARGS 128
#define MAX_ARGSTR_LEN 63
#define MAX_ARRAY_LEN 64
+#define MAX_ARG_NAME_LEN 32
#define MAX_STRING_SIZE PATH_MAX
/* Reserved field names */
@@ -123,6 +124,7 @@ struct fetch_insn {
/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */
#define FETCH_INSN_MAX 16
+#define FETCH_TOKEN_COMM (-ECOMM)
/* Fetch type information table */
struct fetch_type {
@@ -279,8 +281,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
-extern int traceprobe_parse_event_name(const char **pevent,
- const char **pgroup, char *buf);
+int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
+ char *buf, int offset);
extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return);
@@ -297,3 +299,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call);
#endif
extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
size_t offset, struct trace_probe *tp);
+
+#undef ERRORS
+#define ERRORS \
+ C(FILE_NOT_FOUND, "Failed to find the given file"), \
+ C(NO_REGULAR_FILE, "Not a regular file"), \
+ C(BAD_REFCNT, "Invalid reference counter offset"), \
+ C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \
+ C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \
+ C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \
+ C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \
+ C(BAD_MAXACT, "Invalid maxactive number"), \
+ C(MAXACT_TOO_BIG, "Maxactive is too big"), \
+ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \
+ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \
+ C(NO_GROUP_NAME, "Group name is not specified"), \
+ C(GROUP_TOO_LONG, "Group name is too long"), \
+ C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \
+ C(NO_EVENT_NAME, "Event name is not specified"), \
+ C(EVENT_TOO_LONG, "Event name is too long"), \
+ C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \
+ C(RETVAL_ON_PROBE, "$retval is not available on probe"), \
+ C(BAD_STACK_NUM, "Invalid stack number"), \
+ C(BAD_ARG_NUM, "Invalid argument number"), \
+ C(BAD_VAR, "Invalid $-valiable specified"), \
+ C(BAD_REG_NAME, "Invalid register name"), \
+ C(BAD_MEM_ADDR, "Invalid memory address"), \
+ C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \
+ C(BAD_FILE_OFFS, "Invalid file offset value"), \
+ C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \
+ C(TOO_MANY_OPS, "Dereference is too much nested"), \
+ C(DEREF_NEED_BRACE, "Dereference needs a brace"), \
+ C(BAD_DEREF_OFFS, "Invalid dereference offset"), \
+ C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \
+ C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \
+ C(BAD_FETCH_ARG, "Invalid fetch argument"), \
+ C(ARRAY_NO_CLOSE, "Array is not closed"), \
+ C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \
+ C(BAD_ARRAY_NUM, "Invalid array size"), \
+ C(ARRAY_TOO_BIG, "Array number is too big"), \
+ C(BAD_TYPE, "Unknown type is specified"), \
+ C(BAD_STRING, "String accepts only memory argument"), \
+ C(BAD_BITFIELD, "Invalid bitfield"), \
+ C(ARG_NAME_TOO_LONG, "Argument name is too long"), \
+ C(NO_ARG_NAME, "Argument name is not specified"), \
+ C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \
+ C(USED_ARG_NAME, "This argument name is already used"), \
+ C(ARG_TOO_LONG, "Argument expression is too long"), \
+ C(NO_ARG_BODY, "No argument expression"), \
+ C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\
+ C(FAIL_REG_PROBE, "Failed to register probe event"),
+
+#undef C
+#define C(a, b) TP_ERR_##a
+
+/* Define TP_ERR_ */
+enum { ERRORS };
+
+/* Error text is defined in trace_probe.c */
+
+struct trace_probe_log {
+ const char *subsystem;
+ const char **argv;
+ int argc;
+ int index;
+};
+
+void trace_probe_log_init(const char *subsystem, int argc, const char **argv);
+void trace_probe_log_set_index(int index);
+void trace_probe_log_clear(void);
+void __trace_probe_log_err(int offset, int err);
+
+#define trace_probe_log_err(offs, err) \
+ __trace_probe_log_err(offs, TP_ERR_##err)
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 4737bb8c07a3..c30c61f12ddd 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -88,7 +88,7 @@ stage3:
/* 3rd stage: store value to buffer */
if (unlikely(!dest)) {
if (code->op == FETCH_OP_ST_STRING) {
- ret += fetch_store_strlen(val + code->offset);
+ ret = fetch_store_strlen(val + code->offset);
code++;
goto array;
} else
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4ea7e6845efb..743b2b520d34 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -180,8 +180,11 @@ static void wakeup_trace_close(struct trace_iterator *iter)
}
#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
- TRACE_GRAPH_PRINT_ABS_TIME | \
- TRACE_GRAPH_PRINT_DURATION)
+ TRACE_GRAPH_PRINT_CPU | \
+ TRACE_GRAPH_PRINT_REL_TIME | \
+ TRACE_GRAPH_PRINT_DURATION | \
+ TRACE_GRAPH_PRINT_OVERHEAD | \
+ TRACE_GRAPH_PRINT_IRQS)
static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
{
@@ -472,6 +475,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
+ __trace_stack(wakeup_trace, flags, 0, pc);
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
@@ -482,7 +486,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
if (likely(!is_tracing_stopped())) {
wakeup_trace->max_latency = delta;
- update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+ update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu, NULL);
}
out_unlock:
@@ -583,6 +587,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
data->preempt_timestamp = ftrace_now(cpu);
tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+ __trace_stack(wakeup_trace, flags, 0, pc);
/*
* We must be careful in using CALLER_ADDR2. But since wake_up
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 9d402e7fc949..69ee8ef12cee 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace,
/* check the trace buffer */
ret = trace_test_buffer(&tr->trace_buffer, &count);
- trace->reset(tr);
+ /* Need to also simulate the tr->reset to remove this fgraph_ops */
+ tracing_stop_cmdline_record();
+ unregister_ftrace_graph(&fgraph_ops);
+
tracing_start();
if (!ret && !count) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index eec648a0d673..5d16f73898db 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,44 +18,32 @@
#include "trace.h"
-static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
- { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
-unsigned stack_trace_index[STACK_TRACE_ENTRIES];
+#define STACK_TRACE_ENTRIES 500
-/*
- * Reserve one entry for the passed in ip. This will allow
- * us to remove most or all of the stack size overhead
- * added by the stack tracer itself.
- */
-struct stack_trace stack_trace_max = {
- .max_entries = STACK_TRACE_ENTRIES - 1,
- .entries = &stack_dump_trace[0],
-};
+static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
+static unsigned stack_trace_index[STACK_TRACE_ENTRIES];
-unsigned long stack_trace_max_size;
-arch_spinlock_t stack_trace_max_lock =
+static unsigned int stack_trace_nr_entries;
+static unsigned long stack_trace_max_size;
+static arch_spinlock_t stack_trace_max_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
DEFINE_PER_CPU(int, disable_stack_tracer);
static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
-static int last_stack_tracer_enabled;
-void stack_trace_print(void)
+static void print_max_stack(void)
{
long i;
int size;
pr_emerg(" Depth Size Location (%d entries)\n"
" ----- ---- --------\n",
- stack_trace_max.nr_entries);
+ stack_trace_nr_entries);
- for (i = 0; i < stack_trace_max.nr_entries; i++) {
- if (stack_dump_trace[i] == ULONG_MAX)
- break;
- if (i+1 == stack_trace_max.nr_entries ||
- stack_dump_trace[i+1] == ULONG_MAX)
+ for (i = 0; i < stack_trace_nr_entries; i++) {
+ if (i + 1 == stack_trace_nr_entries)
size = stack_trace_index[i];
else
size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -65,16 +53,7 @@ void stack_trace_print(void)
}
}
-/*
- * When arch-specific code overrides this function, the following
- * data should be filled up, assuming stack_trace_max_lock is held to
- * prevent concurrent updates.
- * stack_trace_index[]
- * stack_trace_max
- * stack_trace_max_size
- */
-void __weak
-check_stack(unsigned long ip, unsigned long *stack)
+static void check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
@@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack)
stack_trace_max_size = this_size;
- stack_trace_max.nr_entries = 0;
- stack_trace_max.skip = 0;
-
- save_stack_trace(&stack_trace_max);
+ stack_trace_nr_entries = stack_trace_save(stack_dump_trace,
+ ARRAY_SIZE(stack_dump_trace) - 1,
+ 0);
/* Skip over the overhead of the stack tracer itself */
- for (i = 0; i < stack_trace_max.nr_entries; i++) {
+ for (i = 0; i < stack_trace_nr_entries; i++) {
if (stack_dump_trace[i] == ip)
break;
}
@@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack)
* Some archs may not have the passed in ip in the dump.
* If that happens, we need to show everything.
*/
- if (i == stack_trace_max.nr_entries)
+ if (i == stack_trace_nr_entries)
i = 0;
/*
@@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack)
* loop will only happen once. This code only takes place
* on a new max, so it is far from a fast path.
*/
- while (i < stack_trace_max.nr_entries) {
+ while (i < stack_trace_nr_entries) {
int found = 0;
stack_trace_index[x] = this_size;
p = start;
- for (; p < top && i < stack_trace_max.nr_entries; p++) {
- if (stack_dump_trace[i] == ULONG_MAX)
- break;
+ for (; p < top && i < stack_trace_nr_entries; p++) {
/*
* The READ_ONCE_NOCHECK is used to let KASAN know that
* this is not a stack-out-of-bounds error.
@@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
- stack_trace_max.nr_entries = x;
- for (; x < i; x++)
- stack_dump_trace[x] = ULONG_MAX;
+ stack_trace_nr_entries = x;
if (task_stack_end_corrupted(current)) {
- stack_trace_print();
+ print_max_stack();
BUG();
}
@@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos)
{
long n = *pos - 1;
- if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+ if (n >= stack_trace_nr_entries)
return NULL;
m->private = (void *)n;
@@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
- stack_trace_max.nr_entries);
+ stack_trace_nr_entries);
if (!stack_tracer_enabled && !stack_trace_max_size)
print_disabled(m);
@@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v)
i = *(long *)v;
- if (i >= stack_trace_max.nr_entries ||
- stack_dump_trace[i] == ULONG_MAX)
+ if (i >= stack_trace_nr_entries)
return 0;
- if (i+1 == stack_trace_max.nr_entries ||
- stack_dump_trace[i+1] == ULONG_MAX)
+ if (i + 1 == stack_trace_nr_entries)
size = stack_trace_index[i];
else
size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
+ int was_enabled;
int ret;
mutex_lock(&stack_sysctl_mutex);
+ was_enabled = !!stack_tracer_enabled;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret || !write ||
- (last_stack_tracer_enabled == !!stack_tracer_enabled))
+ if (ret || !write || (was_enabled == !!stack_tracer_enabled))
goto out;
- last_stack_tracer_enabled = !!stack_tracer_enabled;
-
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
else
unregister_ftrace_function(&trace_ops);
-
out:
mutex_unlock(&stack_sysctl_mutex);
return ret;
@@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str)
strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
stack_tracer_enabled = 1;
- last_stack_tracer_enabled = 1;
return 1;
}
__setup("stacktrace", enable_stacktrace);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f93a56d2db27..fa8fbff736d6 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct ring_buffer_event *event;
struct ring_buffer *buffer;
unsigned long irq_flags;
+ unsigned long args[6];
int pc;
int syscall_nr;
int size;
@@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
entry = ring_buffer_event_data(event);
entry->nr = syscall_nr;
- syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
+ syscall_get_arguments(current, regs, args);
+ memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);
event_trigger_unlock_commit(trace_file, buffer, event, entry,
irq_flags, pc);
@@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ unsigned long args[6];
bool valid_prog_array;
int syscall_nr;
int rctx;
@@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
rec->nr = syscall_nr;
- syscall_get_arguments(current, regs, 0, sys_data->nb_args,
- (unsigned long *)&rec->args);
+ syscall_get_arguments(current, regs, args);
+ memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
if ((valid_prog_array &&
!perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9bde07c06362..7860e3f59fad 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
if (unlikely(!maxlen))
return -ENOMEM;
- ret = strncpy_from_user(dst, src, maxlen);
+ if (addr == FETCH_TOKEN_COMM)
+ ret = strlcpy(dst, current->comm, maxlen);
+ else
+ ret = strncpy_from_user(dst, src, maxlen);
if (ret >= 0) {
if (ret == maxlen)
dst[ret - 1] = '\0';
@@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr)
int len;
void __user *vaddr = (void __force __user *) addr;
- len = strnlen_user(vaddr, MAX_STRING_SIZE);
+ if (addr == FETCH_TOKEN_COMM)
+ len = strlen(current->comm) + 1;
+ else
+ len = strnlen_user(vaddr, MAX_STRING_SIZE);
return (len > MAX_STRING_SIZE) ? 0 : len;
}
@@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest,
case FETCH_OP_IMM:
val = code->immediate;
break;
+ case FETCH_OP_COMM:
+ val = FETCH_TOKEN_COMM;
+ break;
case FETCH_OP_FOFFS:
val = translate_user_vaddr(code->immediate);
break;
@@ -273,10 +282,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
{
struct trace_uprobe *tu;
- if (!event || !is_good_name(event))
- return ERR_PTR(-EINVAL);
-
- if (!group || !is_good_name(group))
+ if (!event || !group)
return ERR_PTR(-EINVAL);
tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL);
@@ -420,8 +426,6 @@ end:
/*
* Argument syntax:
* - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
- *
- * - Remove uprobe: -:[GRP/]EVENT
*/
static int trace_uprobe_create(int argc, const char **argv)
{
@@ -437,10 +441,17 @@ static int trace_uprobe_create(int argc, const char **argv)
ret = 0;
ref_ctr_offset = 0;
- /* argc must be >= 1 */
- if (argv[0][0] == 'r')
+ switch (argv[0][0]) {
+ case 'r':
is_return = true;
- else if (argv[0][0] != 'p' || argc < 2)
+ break;
+ case 'p':
+ break;
+ default:
+ return -ECANCELED;
+ }
+
+ if (argc < 2)
return -ECANCELED;
if (argv[0][1] == ':')
@@ -460,13 +471,19 @@ static int trace_uprobe_create(int argc, const char **argv)
return -ECANCELED;
}
+ trace_probe_log_init("trace_uprobe", argc, argv);
+ trace_probe_log_set_index(1); /* filename is the 2nd argument */
+
*arg++ = '\0';
ret = kern_path(filename, LOOKUP_FOLLOW, &path);
if (ret) {
+ trace_probe_log_err(0, FILE_NOT_FOUND);
kfree(filename);
+ trace_probe_log_clear();
return ret;
}
if (!d_is_reg(path.dentry)) {
+ trace_probe_log_err(0, NO_REGULAR_FILE);
ret = -EINVAL;
goto fail_address_parse;
}
@@ -475,9 +492,16 @@ static int trace_uprobe_create(int argc, const char **argv)
rctr = strchr(arg, '(');
if (rctr) {
rctr_end = strchr(rctr, ')');
- if (rctr > rctr_end || *(rctr_end + 1) != 0) {
+ if (!rctr_end) {
+ ret = -EINVAL;
+ rctr_end = rctr + strlen(rctr);
+ trace_probe_log_err(rctr_end - filename,
+ REFCNT_OPEN_BRACE);
+ goto fail_address_parse;
+ } else if (rctr_end[1] != '\0') {
ret = -EINVAL;
- pr_info("Invalid reference counter offset.\n");
+ trace_probe_log_err(rctr_end + 1 - filename,
+ BAD_REFCNT_SUFFIX);
goto fail_address_parse;
}
@@ -485,22 +509,23 @@ static int trace_uprobe_create(int argc, const char **argv)
*rctr_end = '\0';
ret = kstrtoul(rctr, 0, &ref_ctr_offset);
if (ret) {
- pr_info("Invalid reference counter offset.\n");
+ trace_probe_log_err(rctr - filename, BAD_REFCNT);
goto fail_address_parse;
}
}
/* Parse uprobe offset. */
ret = kstrtoul(arg, 0, &offset);
- if (ret)
+ if (ret) {
+ trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS);
goto fail_address_parse;
-
- argc -= 2;
- argv += 2;
+ }
/* setup a probe */
+ trace_probe_log_set_index(0);
if (event) {
- ret = traceprobe_parse_event_name(&event, &group, buf);
+ ret = traceprobe_parse_event_name(&event, &group, buf,
+ event - argv[0]);
if (ret)
goto fail_address_parse;
} else {
@@ -522,10 +547,14 @@ static int trace_uprobe_create(int argc, const char **argv)
kfree(tail);
}
+ argc -= 2;
+ argv += 2;
+
tu = alloc_trace_uprobe(group, event, argc, is_return);
if (IS_ERR(tu)) {
- pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
ret = PTR_ERR(tu);
+ /* This must return -ENOMEM otherwise there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM);
goto fail_address_parse;
}
tu->offset = offset;
@@ -541,6 +570,7 @@ static int trace_uprobe_create(int argc, const char **argv)
goto error;
}
+ trace_probe_log_set_index(i + 2);
ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp,
is_return ? TPARG_FL_RETURN : 0);
kfree(tmp);
@@ -549,20 +579,20 @@ static int trace_uprobe_create(int argc, const char **argv)
}
ret = register_trace_uprobe(tu);
- if (ret)
- goto error;
- return 0;
+ if (!ret)
+ goto out;
error:
free_trace_uprobe(tu);
+out:
+ trace_probe_log_clear();
return ret;
fail_address_parse:
+ trace_probe_log_clear();
path_put(&path);
kfree(filename);
- pr_info("Failed to parse address or file.\n");
-
return ret;
}
@@ -1306,7 +1336,7 @@ static inline void init_trace_event_call(struct trace_uprobe *tu,
call->event.funcs = &uprobe_funcs;
call->class->define_fields = uprobe_event_define_fields;
- call->flags = TRACE_EVENT_FL_UPROBE;
+ call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY;
call->class->reg = trace_uprobe_register;
call->data = tu;
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 46f2ab1e08a9..df3ade14ccbd 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2008-2014 Mathieu Desnoyers
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <linux/module.h>
#include <linux/mutex.h>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 370724b45391..7be3e7530841 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -1,19 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* tsacct.c - System accounting over taskstats interface
*
* Copyright (C) Jay Lan, <jlan@sgi.com>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/kernel.h>
diff --git a/kernel/ucount.c b/kernel/ucount.c
index f48d1b6376a4..feb128c7b5d9 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -1,9 +1,4 @@
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
- */
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/stat.h>
#include <linux/sysctl.h>
diff --git a/kernel/umh.c b/kernel/umh.c
index d937cbad903a..7f255b5a8845 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* umh - the kernel usermode helper
*/
diff --git a/kernel/up.c b/kernel/up.c
index ff536f9cc8a2..862b460ab97a 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Uniprocessor-only support functions. The counterpart to kernel/smp.c
*/
@@ -34,14 +35,13 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
}
EXPORT_SYMBOL(smp_call_function_single_async);
-int on_each_cpu(smp_call_func_t func, void *info, int wait)
+void on_each_cpu(smp_call_func_t func, void *info, int wait)
{
unsigned long flags;
local_irq_save(flags);
func(info);
local_irq_restore(flags);
- return 0;
}
EXPORT_SYMBOL(on_each_cpu);
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 9586b670a5b2..870ecd7c63ed 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/user-return-notifier.h>
#include <linux/percpu.h>
diff --git a/kernel/user.c b/kernel/user.c
index 0df9b1640b2a..5235d7f49982 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* The "user cache".
*
@@ -62,9 +63,9 @@ struct user_namespace init_user_ns = {
.ns.ops = &userns_operations,
#endif
.flags = USERNS_INIT_FLAGS,
-#ifdef CONFIG_PERSISTENT_KEYRINGS
- .persistent_keyring_register_sem =
- __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
+#ifdef CONFIG_KEYS
+ .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
+ .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
#endif
};
EXPORT_SYMBOL_GPL(init_user_ns);
@@ -140,8 +141,6 @@ static void free_user(struct user_struct *up, unsigned long flags)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
- key_put(up->uid_keyring);
- key_put(up->session_keyring);
kmem_cache_free(uid_cachep, up);
}
@@ -185,7 +184,7 @@ struct user_struct *alloc_uid(kuid_t uid)
if (!up) {
new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
if (!new)
- goto out_unlock;
+ return NULL;
new->uid = uid;
refcount_set(&new->__count, 1);
@@ -199,8 +198,6 @@ struct user_struct *alloc_uid(kuid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
- key_put(new->uid_keyring);
- key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
} else {
uid_hash_insert(new, hashent);
@@ -210,9 +207,6 @@ struct user_struct *alloc_uid(kuid_t uid)
}
return up;
-
-out_unlock:
- return NULL;
}
static int __init uid_cache_init(void)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 923414a246e9..8eadadc478f9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1,9 +1,4 @@
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
- */
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/nsproxy.h>
@@ -133,8 +128,9 @@ int create_user_ns(struct cred *new)
ns->flags = parent_ns->flags;
mutex_unlock(&userns_state_mutex);
-#ifdef CONFIG_PERSISTENT_KEYRINGS
- init_rwsem(&ns->persistent_keyring_register_sem);
+#ifdef CONFIG_KEYS
+ INIT_LIST_HEAD(&ns->keyring_name_list);
+ init_rwsem(&ns->keyring_sem);
#endif
ret = -ENOMEM;
if (!setup_userns_sysctls(ns))
@@ -196,9 +192,7 @@ static void free_user_ns(struct work_struct *work)
kfree(ns->projid_map.reverse);
}
retire_userns_sysctls(ns);
-#ifdef CONFIG_PERSISTENT_KEYRINGS
- key_put(ns->persistent_keyring_register);
-#endif
+ key_free_user_ns(ns);
ns_free_inum(&ns->ns);
kmem_cache_free(user_ns_cachep, ns);
dec_user_namespaces(ucounts);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index dcd6be1996fe..f0e491193009 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2004 IBM Corporation
*
* Author: Serge Hallyn <serue@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
*/
#include <linux/export.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 258033d62cb3..3732c888a949 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2007
*
* Author: Eric Biederman <ebiederm@xmision.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
*/
#include <linux/export.h>
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 977918d5d350..7f9e7b9306fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -42,9 +42,9 @@ int __read_mostly watchdog_user_enabled = 1;
int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
int __read_mostly soft_watchdog_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
-int __read_mostly nmi_watchdog_available;
+static int __read_mostly nmi_watchdog_available;
-struct cpumask watchdog_allowed_mask __read_mostly;
+static struct cpumask watchdog_allowed_mask __read_mostly;
struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -199,6 +199,13 @@ static int __init nosoftlockup_setup(char *str)
}
__setup("nosoftlockup", nosoftlockup_setup);
+static int __init watchdog_thresh_setup(char *str)
+{
+ get_option(&str, &watchdog_thresh);
+ return 1;
+}
+__setup("watchdog_thresh=", watchdog_thresh_setup);
+
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
@@ -547,13 +554,15 @@ static void softlockup_start_all(void)
int lockup_detector_online_cpu(unsigned int cpu)
{
- watchdog_enable(cpu);
+ if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
+ watchdog_enable(cpu);
return 0;
}
int lockup_detector_offline_cpu(unsigned int cpu)
{
- watchdog_disable(cpu);
+ if (cpumask_test_cpu(cpu, &watchdog_allowed_mask))
+ watchdog_disable(cpu);
return 0;
}
@@ -581,7 +590,7 @@ static void lockup_detector_reconfigure(void)
* Create the watchdog thread infrastructure and configure the detector(s).
*
* The threads are not unparked as watchdog_allowed_mask is empty. When
- * the threads are sucessfully initialized, take the proper locks and
+ * the threads are successfully initialized, take the proper locks and
* unpark the threads in the watchdog_cpumask if the watchdog is enabled.
*/
static __init void lockup_detector_setup(void)
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 71381168dede..247bf0b1582c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
+ this_cpu);
print_modules();
print_irqtrace_events(current);
if (regs)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fc5d23d752a5..601d61150b65 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/workqueue.c - generic async execution with shared worker pool
*
@@ -127,16 +128,16 @@ enum {
*
* PL: wq_pool_mutex protected.
*
- * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
+ * PR: wq_pool_mutex protected for writes. RCU protected for reads.
*
* PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads.
*
* PWR: wq_pool_mutex and wq->mutex protected for writes. Either or
- * sched-RCU for reads.
+ * RCU for reads.
*
* WQ: wq->mutex protected.
*
- * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
+ * WR: wq->mutex protected for writes. RCU protected for reads.
*
* MD: wq_mayday_lock protected.
*/
@@ -183,7 +184,7 @@ struct worker_pool {
atomic_t nr_running ____cacheline_aligned_in_smp;
/*
- * Destruction of pool is sched-RCU protected to allow dereferences
+ * Destruction of pool is RCU protected to allow dereferences
* from get_work_pool().
*/
struct rcu_head rcu;
@@ -212,7 +213,7 @@ struct pool_workqueue {
/*
* Release of unbound pwq is punted to system_wq. See put_pwq()
* and pwq_unbound_release_workfn() for details. pool_workqueue
- * itself is also sched-RCU protected so that the first pwq can be
+ * itself is also RCU protected so that the first pwq can be
* determined without grabbing wq->mutex.
*/
struct work_struct unbound_release_work;
@@ -259,13 +260,15 @@ struct workqueue_struct {
struct wq_device *wq_dev; /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
+ char *lock_name;
+ struct lock_class_key key;
struct lockdep_map lockdep_map;
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
/*
- * Destruction of workqueue_struct is sched-RCU protected to allow
- * walking the workqueues list without grabbing wq_pool_mutex.
+ * Destruction of workqueue_struct is RCU protected to allow walking
+ * the workqueues list without grabbing wq_pool_mutex.
* This is used to dump all workqueues from sysrq.
*/
struct rcu_head rcu;
@@ -357,20 +360,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
#include <trace/events/workqueue.h>
#define assert_rcu_or_pool_mutex() \
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&wq_pool_mutex), \
- "sched RCU or wq_pool_mutex should be held")
+ "RCU or wq_pool_mutex should be held")
#define assert_rcu_or_wq_mutex(wq) \
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&wq->mutex), \
- "sched RCU or wq->mutex should be held")
+ "RCU or wq->mutex should be held")
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&wq->mutex) && \
!lockdep_is_held(&wq_pool_mutex), \
- "sched RCU, wq->mutex or wq_pool_mutex should be held")
+ "RCU, wq->mutex or wq_pool_mutex should be held")
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
@@ -382,7 +385,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
* @pool: iteration cursor
* @pi: integer used for iteration
*
- * This must be called either with wq_pool_mutex held or sched RCU read
+ * This must be called either with wq_pool_mutex held or RCU read
* locked. If the pool needs to be used beyond the locking in effect, the
* caller is responsible for guaranteeing that the pool stays online.
*
@@ -414,7 +417,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
* @pwq: iteration cursor
* @wq: the target workqueue
*
- * This must be called either with wq->mutex held or sched RCU read locked.
+ * This must be called either with wq->mutex held or RCU read locked.
* If the pwq needs to be used beyond the locking in effect, the caller is
* responsible for guaranteeing that the pwq stays online.
*
@@ -550,7 +553,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
* @wq: the target workqueue
* @node: the node ID
*
- * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU
+ * This must be called with any of wq_pool_mutex, wq->mutex or RCU
* read locked.
* If the pwq needs to be used beyond the locking in effect, the caller is
* responsible for guaranteeing that the pwq stays online.
@@ -646,7 +649,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
* The following mb guarantees that previous clear of a PENDING bit
* will not be reordered with any speculative LOADS or STORES from
* work->current_func, which is executed afterwards. This possible
- * reordering can lead to a missed execution on attempt to qeueue
+ * reordering can lead to a missed execution on attempt to queue
* the same @work. E.g. consider this case:
*
* CPU#0 CPU#1
@@ -694,8 +697,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
* @work: the work item of interest
*
* Pools are created and destroyed under wq_pool_mutex, and allows read
- * access under sched-RCU read lock. As such, this function should be
- * called under wq_pool_mutex or with preemption disabled.
+ * access under RCU read lock. As such, this function should be
+ * called under wq_pool_mutex or inside of a rcu_read_lock() region.
*
* All fields of the returned pool are accessible as long as the above
* mentioned locking is in effect. If the returned pool needs to be used
@@ -839,43 +842,32 @@ static void wake_up_worker(struct worker_pool *pool)
}
/**
- * wq_worker_waking_up - a worker is waking up
+ * wq_worker_running - a worker is running again
* @task: task waking up
- * @cpu: CPU @task is waking up to
*
- * This function is called during try_to_wake_up() when a worker is
- * being awoken.
- *
- * CONTEXT:
- * spin_lock_irq(rq->lock)
+ * This function is called when a worker returns from schedule()
*/
-void wq_worker_waking_up(struct task_struct *task, int cpu)
+void wq_worker_running(struct task_struct *task)
{
struct worker *worker = kthread_data(task);
- if (!(worker->flags & WORKER_NOT_RUNNING)) {
- WARN_ON_ONCE(worker->pool->cpu != cpu);
+ if (!worker->sleeping)
+ return;
+ if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(&worker->pool->nr_running);
- }
+ worker->sleeping = 0;
}
/**
* wq_worker_sleeping - a worker is going to sleep
* @task: task going to sleep
*
- * This function is called during schedule() when a busy worker is
- * going to sleep. Worker on the same cpu can be woken up by
- * returning pointer to its task.
- *
- * CONTEXT:
- * spin_lock_irq(rq->lock)
- *
- * Return:
- * Worker task on @cpu to wake up, %NULL if none.
+ * This function is called from schedule() when a busy worker is
+ * going to sleep.
*/
-struct task_struct *wq_worker_sleeping(struct task_struct *task)
+void wq_worker_sleeping(struct task_struct *task)
{
- struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+ struct worker *next, *worker = kthread_data(task);
struct worker_pool *pool;
/*
@@ -884,13 +876,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
* checking NOT_RUNNING.
*/
if (worker->flags & WORKER_NOT_RUNNING)
- return NULL;
+ return;
pool = worker->pool;
- /* this can only happen on the local cpu */
- if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
- return NULL;
+ if (WARN_ON_ONCE(worker->sleeping))
+ return;
+
+ worker->sleeping = 1;
+ spin_lock_irq(&pool->lock);
/*
* The counterpart of the following dec_and_test, implied mb,
@@ -904,13 +898,17 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
* lock is safe.
*/
if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist))
- to_wakeup = first_idle_worker(pool);
- return to_wakeup ? to_wakeup->task : NULL;
+ !list_empty(&pool->worklist)) {
+ next = first_idle_worker(pool);
+ if (next)
+ wake_up_process(next->task);
+ }
+ spin_unlock_irq(&pool->lock);
}
/**
* wq_worker_last_func - retrieve worker's last work function
+ * @task: Task to retrieve last work function of.
*
* Determine the last function a worker executed. This is called from
* the scheduler to get a worker's last known identity.
@@ -918,6 +916,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
* CONTEXT:
* spin_lock_irq(rq->lock)
*
+ * This function is called during schedule() when a kworker is going
+ * to sleep. It's used by psi to identify aggregation workers during
+ * dequeuing, to allow periodic aggregation to shut-off when that
+ * worker is the last task in the system or cgroup to go to sleep.
+ *
+ * As this function doesn't involve any workqueue-related locking, it
+ * only returns stable values when called from inside the scheduler's
+ * queuing and dequeuing paths, when @task, which must be a kworker,
+ * is guaranteed to not be processing any works.
+ *
* Return:
* The last work function %current executed as a worker, NULL if it
* hasn't executed any work yet.
@@ -1120,7 +1128,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
{
if (pwq) {
/*
- * As both pwqs and pools are sched-RCU protected, the
+ * As both pwqs and pools are RCU protected, the
* following lock operations are safe.
*/
spin_lock_irq(&pwq->pool->lock);
@@ -1248,6 +1256,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
return 0;
+ rcu_read_lock();
/*
* The queueing is in progress, or it is already queued. Try to
* steal it from ->worklist without clearing WORK_STRUCT_PENDING.
@@ -1286,10 +1295,12 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
set_work_pool_and_keep_pending(work, pool->id);
spin_unlock(&pool->lock);
+ rcu_read_unlock();
return 1;
}
spin_unlock(&pool->lock);
fail:
+ rcu_read_unlock();
local_irq_restore(*flags);
if (work_is_canceling(work))
return -ENOENT;
@@ -1341,7 +1352,7 @@ static bool is_chained_work(struct workqueue_struct *wq)
worker = current_wq_worker();
/*
- * Return %true iff I'm a worker execuing a work item on @wq. If
+ * Return %true iff I'm a worker executing a work item on @wq. If
* I'm @worker, it's safe to dereference it without locking.
*/
return worker && worker->current_pwq->wq == wq;
@@ -1403,6 +1414,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
+ rcu_read_lock();
retry:
if (req_cpu == WORK_CPU_UNBOUND)
cpu = wq_select_unbound_cpu(raw_smp_processor_id());
@@ -1459,10 +1471,8 @@ retry:
/* pwq determined, queue */
trace_workqueue_queue_work(req_cpu, pwq, work);
- if (WARN_ON(!list_empty(&work->entry))) {
- spin_unlock(&pwq->pool->lock);
- return;
- }
+ if (WARN_ON(!list_empty(&work->entry)))
+ goto out;
pwq->nr_in_flight[pwq->work_color]++;
work_flags = work_color_to_flags(pwq->work_color);
@@ -1480,7 +1490,9 @@ retry:
insert_work(pwq, work, worklist, work_flags);
+out:
spin_unlock(&pwq->pool->lock);
+ rcu_read_unlock();
}
/**
@@ -1512,6 +1524,90 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL(queue_work_on);
+/**
+ * workqueue_select_cpu_near - Select a CPU based on NUMA node
+ * @node: NUMA node ID that we want to select a CPU from
+ *
+ * This function will attempt to find a "random" cpu available on a given
+ * node. If there are no CPUs available on the given node it will return
+ * WORK_CPU_UNBOUND indicating that we should just schedule to any
+ * available CPU if we need to schedule this work.
+ */
+static int workqueue_select_cpu_near(int node)
+{
+ int cpu;
+
+ /* No point in doing this if NUMA isn't enabled for workqueues */
+ if (!wq_numa_enabled)
+ return WORK_CPU_UNBOUND;
+
+ /* Delay binding to CPU if node is not valid or online */
+ if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
+ return WORK_CPU_UNBOUND;
+
+ /* Use local node/cpu if we are already there */
+ cpu = raw_smp_processor_id();
+ if (node == cpu_to_node(cpu))
+ return cpu;
+
+ /* Use "random" otherwise know as "first" online CPU of node */
+ cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);
+
+ /* If CPU is valid return that, otherwise just defer */
+ return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND;
+}
+
+/**
+ * queue_work_node - queue work on a "random" cpu for a given NUMA node
+ * @node: NUMA node that we are targeting the work for
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * We queue the work to a "random" CPU within a given NUMA node. The basic
+ * idea here is to provide a way to somehow associate work with a given
+ * NUMA node.
+ *
+ * This function will only make a best effort attempt at getting this onto
+ * the right NUMA node. If no node is requested or the requested node is
+ * offline then we just fall back to standard queue_work behavior.
+ *
+ * Currently the "random" CPU ends up being the first available CPU in the
+ * intersection of cpu_online_mask and the cpumask of the node, unless we
+ * are running on the node. In that case we just use the current CPU.
+ *
+ * Return: %false if @work was already on a queue, %true otherwise.
+ */
+bool queue_work_node(int node, struct workqueue_struct *wq,
+ struct work_struct *work)
+{
+ unsigned long flags;
+ bool ret = false;
+
+ /*
+ * This current implementation is specific to unbound workqueues.
+ * Specifically we only return the first available CPU for a given
+ * node instead of cycling through individual CPUs within the node.
+ *
+ * If this is used with a per-cpu workqueue then the logic in
+ * workqueue_select_cpu_near would need to be updated to allow for
+ * some round robin type logic.
+ */
+ WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));
+
+ local_irq_save(flags);
+
+ if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+ int cpu = workqueue_select_cpu_near(node);
+
+ __queue_work(cpu, wq, work);
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work_node);
+
void delayed_work_timer_fn(struct timer_list *t)
{
struct delayed_work *dwork = from_timer(dwork, t, timer);
@@ -1639,7 +1735,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu)
*
* Return: %false if @rwork was already pending, %true otherwise. Note
* that a full RCU grace period is guaranteed only after a %true return.
- * While @rwork is guarnateed to be executed after a %false return, the
+ * While @rwork is guaranteed to be executed after a %false return, the
* execution may happen before a full RCU grace period has passed.
*/
bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
@@ -2181,7 +2277,7 @@ __acquires(&pool->lock)
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
- " last function: %pf\n",
+ " last function: %ps\n",
current->comm, preempt_count(), task_pid_nr(current),
worker->current_func);
debug_show_held_locks(current);
@@ -2500,11 +2596,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,
worker = current_wq_worker();
WARN_ONCE(current->flags & PF_MEMALLOC,
- "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
+ "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
current->pid, current->comm, target_wq->name, target_func);
WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
(WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
- "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
+ "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
worker->current_pwq->wq->name, worker->current_func,
target_wq->name, target_func);
}
@@ -2878,14 +2974,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
might_sleep();
- local_irq_disable();
+ rcu_read_lock();
pool = get_work_pool(work);
if (!pool) {
- local_irq_enable();
+ rcu_read_unlock();
return false;
}
- spin_lock(&pool->lock);
+ spin_lock_irq(&pool->lock);
/* see the comment in try_to_grab_pending() with the same code */
pwq = get_work_pwq(work);
if (pwq) {
@@ -2917,10 +3013,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_release(&pwq->wq->lockdep_map);
}
-
+ rcu_read_unlock();
return true;
already_gone:
spin_unlock_irq(&pool->lock);
+ rcu_read_unlock();
return false;
}
@@ -2931,6 +3028,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel)
if (WARN_ON(!wq_online))
return false;
+ if (WARN_ON(!work->func))
+ return false;
+
if (!from_cancel) {
lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
@@ -3229,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context);
*
* Undo alloc_workqueue_attrs().
*/
-void free_workqueue_attrs(struct workqueue_attrs *attrs)
+static void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
if (attrs) {
free_cpumask_var(attrs->cpumask);
@@ -3239,21 +3339,20 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
/**
* alloc_workqueue_attrs - allocate a workqueue_attrs
- * @gfp_mask: allocation mask to use
*
* Allocate a new workqueue_attrs, initialize with default settings and
* return it.
*
* Return: The allocated new workqueue_attr on success. %NULL on failure.
*/
-struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
+static struct workqueue_attrs *alloc_workqueue_attrs(void)
{
struct workqueue_attrs *attrs;
- attrs = kzalloc(sizeof(*attrs), gfp_mask);
+ attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
if (!attrs)
goto fail;
- if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
+ if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL))
goto fail;
cpumask_copy(attrs->cpumask, cpu_possible_mask);
@@ -3331,17 +3430,57 @@ static int init_worker_pool(struct worker_pool *pool)
pool->refcnt = 1;
/* shouldn't fail above this point */
- pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ pool->attrs = alloc_workqueue_attrs();
if (!pool->attrs)
return -ENOMEM;
return 0;
}
+#ifdef CONFIG_LOCKDEP
+static void wq_init_lockdep(struct workqueue_struct *wq)
+{
+ char *lock_name;
+
+ lockdep_register_key(&wq->key);
+ lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
+ if (!lock_name)
+ lock_name = wq->name;
+
+ wq->lock_name = lock_name;
+ lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0);
+}
+
+static void wq_unregister_lockdep(struct workqueue_struct *wq)
+{
+ lockdep_unregister_key(&wq->key);
+}
+
+static void wq_free_lockdep(struct workqueue_struct *wq)
+{
+ if (wq->lock_name != wq->name)
+ kfree(wq->lock_name);
+}
+#else
+static void wq_init_lockdep(struct workqueue_struct *wq)
+{
+}
+
+static void wq_unregister_lockdep(struct workqueue_struct *wq)
+{
+}
+
+static void wq_free_lockdep(struct workqueue_struct *wq)
+{
+}
+#endif
+
static void rcu_free_wq(struct rcu_head *rcu)
{
struct workqueue_struct *wq =
container_of(rcu, struct workqueue_struct, rcu);
+ wq_free_lockdep(wq);
+
if (!(wq->flags & WQ_UNBOUND))
free_percpu(wq->cpu_pwqs);
else
@@ -3364,7 +3503,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
* put_unbound_pool - put a worker_pool
* @pool: worker_pool to put
*
- * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
+ * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU
* safe manner. get_unbound_pool() calls this function on its failure path
* and this function should be able to release pools which went through,
* successfully or not, init_worker_pool().
@@ -3418,7 +3557,7 @@ static void put_unbound_pool(struct worker_pool *pool)
del_timer_sync(&pool->idle_timer);
del_timer_sync(&pool->mayday_timer);
- /* sched-RCU protected to allow dereferences from get_work_pool() */
+ /* RCU protected to allow dereferences from get_work_pool() */
call_rcu(&pool->rcu, rcu_free_pool);
}
@@ -3532,8 +3671,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
* If we're the last pwq going away, @wq is already dead and no one
* is gonna access it anymore. Schedule RCU free.
*/
- if (is_last)
+ if (is_last) {
+ wq_unregister_lockdep(wq);
call_rcu(&wq->rcu, rcu_free_wq);
+ }
}
/**
@@ -3754,8 +3895,8 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
- new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
- tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ new_attrs = alloc_workqueue_attrs();
+ tmp_attrs = alloc_workqueue_attrs();
if (!ctx || !new_attrs || !tmp_attrs)
goto out_free;
@@ -3891,7 +4032,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
*
* Return: 0 on success and -errno on failure.
*/
-int apply_workqueue_attrs(struct workqueue_struct *wq,
+static int apply_workqueue_attrs(struct workqueue_struct *wq,
const struct workqueue_attrs *attrs)
{
int ret;
@@ -3902,7 +4043,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
return ret;
}
-EXPORT_SYMBOL_GPL(apply_workqueue_attrs);
/**
* wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
@@ -4067,11 +4207,10 @@ static int init_rescuer(struct workqueue_struct *wq)
return 0;
}
-struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
- unsigned int flags,
- int max_active,
- struct lock_class_key *key,
- const char *lock_name, ...)
+__printf(1, 4)
+struct workqueue_struct *alloc_workqueue(const char *fmt,
+ unsigned int flags,
+ int max_active, ...)
{
size_t tbl_size = 0;
va_list args;
@@ -4101,12 +4240,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
return NULL;
if (flags & WQ_UNBOUND) {
- wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ wq->unbound_attrs = alloc_workqueue_attrs();
if (!wq->unbound_attrs)
goto err_free_wq;
}
- va_start(args, lock_name);
+ va_start(args, max_active);
vsnprintf(wq->name, sizeof(wq->name), fmt, args);
va_end(args);
@@ -4123,11 +4262,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
INIT_LIST_HEAD(&wq->flusher_overflow);
INIT_LIST_HEAD(&wq->maydays);
- lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
+ wq_init_lockdep(wq);
INIT_LIST_HEAD(&wq->list);
if (alloc_and_link_pwqs(wq) < 0)
- goto err_free_wq;
+ goto err_unreg_lockdep;
if (wq_online && init_rescuer(wq) < 0)
goto err_destroy;
@@ -4153,6 +4292,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
return wq;
+err_unreg_lockdep:
+ wq_unregister_lockdep(wq);
+ wq_free_lockdep(wq);
err_free_wq:
free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
@@ -4161,7 +4303,7 @@ err_destroy:
destroy_workqueue(wq);
return NULL;
}
-EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
+EXPORT_SYMBOL_GPL(alloc_workqueue);
/**
* destroy_workqueue - safely terminate a workqueue
@@ -4214,6 +4356,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
kthread_stop(wq->rescuer->task);
if (!(wq->flags & WQ_UNBOUND)) {
+ wq_unregister_lockdep(wq);
/*
* The base ref is never dropped on per-cpu pwqs. Directly
* schedule RCU free.
@@ -4328,7 +4471,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
struct pool_workqueue *pwq;
bool ret;
- rcu_read_lock_sched();
+ rcu_read_lock();
+ preempt_disable();
if (cpu == WORK_CPU_UNBOUND)
cpu = smp_processor_id();
@@ -4339,7 +4483,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
ret = !list_empty(&pwq->delayed_works);
- rcu_read_unlock_sched();
+ preempt_enable();
+ rcu_read_unlock();
return ret;
}
@@ -4365,15 +4510,15 @@ unsigned int work_busy(struct work_struct *work)
if (work_pending(work))
ret |= WORK_BUSY_PENDING;
- local_irq_save(flags);
+ rcu_read_lock();
pool = get_work_pool(work);
if (pool) {
- spin_lock(&pool->lock);
+ spin_lock_irqsave(&pool->lock, flags);
if (find_worker_executing_work(pool, work))
ret |= WORK_BUSY_RUNNING;
- spin_unlock(&pool->lock);
+ spin_unlock_irqrestore(&pool->lock, flags);
}
- local_irq_restore(flags);
+ rcu_read_unlock();
return ret;
}
@@ -4444,7 +4589,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
if (fn || name[0] || desc[0]) {
- printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
+ printk("%sWorkqueue: %s %ps", log_lvl, name, fn);
if (strcmp(name, desc))
pr_cont(" (%s)", desc);
pr_cont("\n");
@@ -4469,7 +4614,7 @@ static void pr_cont_work(bool comma, struct work_struct *work)
pr_cont("%s BAR(%d)", comma ? "," : "",
task_pid_nr(barr->task));
} else {
- pr_cont("%s %pf", comma ? "," : "", work->func);
+ pr_cont("%s %ps", comma ? "," : "", work->func);
}
}
@@ -4501,7 +4646,7 @@ static void show_pwq(struct pool_workqueue *pwq)
if (worker->current_pwq != pwq)
continue;
- pr_cont("%s %d%s:%pf", comma ? "," : "",
+ pr_cont("%s %d%s:%ps", comma ? "," : "",
task_pid_nr(worker->task),
worker == pwq->wq->rescuer ? "(RESCUER)" : "",
worker->current_func);
@@ -4557,7 +4702,7 @@ void show_workqueue_state(void)
unsigned long flags;
int pi;
- rcu_read_lock_sched();
+ rcu_read_lock();
pr_info("Showing busy workqueues and worker pools:\n");
@@ -4622,7 +4767,7 @@ void show_workqueue_state(void)
touch_nmi_watchdog();
}
- rcu_read_unlock_sched();
+ rcu_read_unlock();
}
/* used to show worker information through /proc/PID/{comm,stat,status} */
@@ -4786,7 +4931,7 @@ static void rebind_workers(struct worker_pool *pool)
*
* WRITE_ONCE() is necessary because @worker->flags may be
* tested without holding any lock in
- * wq_worker_waking_up(). Without it, NOT_RUNNING test may
+ * wq_worker_running(). Without it, NOT_RUNNING test may
* fail incorrectly leading to premature concurrency
* management operations.
*/
@@ -5009,16 +5154,16 @@ bool freeze_workqueues_busy(void)
* nr_active is monotonically decreasing. It's safe
* to peek without lock.
*/
- rcu_read_lock_sched();
+ rcu_read_lock();
for_each_pwq(pwq, wq) {
WARN_ON_ONCE(pwq->nr_active < 0);
if (pwq->nr_active) {
busy = true;
- rcu_read_unlock_sched();
+ rcu_read_unlock();
goto out_unlock;
}
}
- rcu_read_unlock_sched();
+ rcu_read_unlock();
}
out_unlock:
mutex_unlock(&wq_pool_mutex);
@@ -5213,7 +5358,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
const char *delim = "";
int node, written = 0;
- rcu_read_lock_sched();
+ get_online_cpus();
+ rcu_read_lock();
for_each_node(node) {
written += scnprintf(buf + written, PAGE_SIZE - written,
"%s%d:%d", delim, node,
@@ -5221,7 +5367,8 @@ static ssize_t wq_pool_ids_show(struct device *dev,
delim = " ";
}
written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
- rcu_read_unlock_sched();
+ rcu_read_unlock();
+ put_online_cpus();
return written;
}
@@ -5246,7 +5393,7 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
lockdep_assert_held(&wq_pool_mutex);
- attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ attrs = alloc_workqueue_attrs();
if (!attrs)
return NULL;
@@ -5668,7 +5815,7 @@ static void __init wq_numa_init(void)
return;
}
- wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
+ wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs();
BUG_ON(!wq_update_unbound_numa_attrs_buf);
/*
@@ -5743,7 +5890,7 @@ int __init workqueue_init_early(void)
for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
struct workqueue_attrs *attrs;
- BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ BUG_ON(!(attrs = alloc_workqueue_attrs()));
attrs->nice = std_nice[i];
unbound_std_wq_attrs[i] = attrs;
@@ -5752,7 +5899,7 @@ int __init workqueue_init_early(void)
* guaranteed by max_active which is enforced by pwqs.
* Turn off NUMA so that dfl_pwq is used for all nodes.
*/
- BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ BUG_ON(!(attrs = alloc_workqueue_attrs()));
attrs->nice = std_nice[i];
attrs->no_numa = true;
ordered_wq_attrs[i] = attrs;
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index cb68b03ca89a..498de0e909a4 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -44,6 +44,7 @@ struct worker {
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
+ int sleeping; /* None */
/*
* Opaque string set with work_set_desc(). Printed out with task
@@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void)
* Scheduler hooks for concurrency managed workqueue. Only to be used from
* sched/ and workqueue.c.
*/
-void wq_worker_waking_up(struct task_struct *task, int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task);
+void wq_worker_running(struct task_struct *task);
+void wq_worker_sleeping(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */